# NOTE: `from __future__ import ...` must be the very first statement in a
# module (it previously sat below other imports, which is a SyntaxError when
# this file is run as a .py script). It is a no-op on Python 3 anyway.
from __future__ import division

# Standard library
import datetime as dt
import warnings

# Third-party: data handling and plotting
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import plotly.graph_objs as go
import plotly.offline as pyoff

# Third-party: clustering / modelling
import scipy.cluster.hierarchy as sch
from sklearn import metrics
from sklearn import preprocessing
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

# Silence noisy library warnings for cleaner notebook output
warnings.filterwarnings('ignore')
# Load the five raw Capstone data sets from local disk.
# NOTE(review): hard-coded absolute user paths only resolve on the author's
# machine — consider a configurable base directory.
# Transaction-level online sales
data1 = pd.read_csv("/Users/nartneyu13/Desktop/CLV data - Capstone Project/Online_Sales.csv")
# Customer demographics (gender, location, tenure in months)
data2 = pd.read_excel("/Users/nartneyu13/Desktop/CLV data - Capstone Project/CustomersData.xlsx")
# Monthly discount coupons per product category
data3 = pd.read_csv("/Users/nartneyu13/Desktop/CLV data - Capstone Project/Discount_Coupon.csv")
# Daily offline/online marketing spend
data4 = pd.read_csv("/Users/nartneyu13/Desktop/CLV data - Capstone Project/Marketing_Spend.csv")
# GST (tax) rate per product category
data5 = pd.read_excel("/Users/nartneyu13/Desktop/CLV data - Capstone Project/Tax_amount.xlsx")
# Inspect schema and first rows of the sales data
data1.info()
data1.head()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 52924 entries, 0 to 52923 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CustomerID 52924 non-null int64 1 Transaction_ID 52924 non-null int64 2 Transaction_Date 52924 non-null object 3 Product_SKU 52924 non-null object 4 Product_Description 52924 non-null object 5 Product_Category 52924 non-null object 6 Quantity 52924 non-null int64 7 Avg_Price 52924 non-null float64 8 Delivery_Charges 52924 non-null float64 9 Coupon_Status 52924 non-null object dtypes: float64(2), int64(3), object(5) memory usage: 4.0+ MB
| CustomerID | Transaction_ID | Transaction_Date | Product_SKU | Product_Description | Product_Category | Quantity | Avg_Price | Delivery_Charges | Coupon_Status | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 17850 | 16679 | 1/1/2019 | GGOENEBJ079499 | Nest Learning Thermostat 3rd Gen-USA - Stainle... | Nest-USA | 1 | 153.71 | 6.5 | Used |
| 1 | 17850 | 16680 | 1/1/2019 | GGOENEBJ079499 | Nest Learning Thermostat 3rd Gen-USA - Stainle... | Nest-USA | 1 | 153.71 | 6.5 | Used |
| 2 | 17850 | 16681 | 1/1/2019 | GGOEGFKQ020399 | Google Laptop and Cell Phone Stickers | Office | 1 | 2.05 | 6.5 | Used |
| 3 | 17850 | 16682 | 1/1/2019 | GGOEGAAB010516 | Google Men's 100% Cotton Short Sleeve Hero Tee... | Apparel | 5 | 17.53 | 6.5 | Not Used |
| 4 | 17850 | 16682 | 1/1/2019 | GGOEGBJL013999 | Google Canvas Tote Natural/Navy | Bags | 1 | 16.50 | 6.5 | Used |
# Inspect schema and first rows of the customer data
data2.info()
data2.head()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1468 entries, 0 to 1467 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CustomerID 1468 non-null int64 1 Gender 1468 non-null object 2 Location 1468 non-null object 3 Tenure_Months 1468 non-null int64 dtypes: int64(2), object(2) memory usage: 46.0+ KB
| CustomerID | Gender | Location | Tenure_Months | |
|---|---|---|---|---|
| 0 | 17850 | M | Chicago | 12 |
| 1 | 13047 | M | California | 43 |
| 2 | 12583 | M | Chicago | 33 |
| 3 | 13748 | F | California | 30 |
| 4 | 15100 | M | California | 49 |
# Inspect schema and first rows of the coupon data
data3.info()
data3.head()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 204 entries, 0 to 203 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Month 204 non-null object 1 Product_Category 204 non-null object 2 Coupon_Code 204 non-null object 3 Discount_pct 204 non-null int64 dtypes: int64(1), object(3) memory usage: 6.5+ KB
| Month | Product_Category | Coupon_Code | Discount_pct | |
|---|---|---|---|---|
| 0 | Jan | Apparel | SALE10 | 10 |
| 1 | Feb | Apparel | SALE20 | 20 |
| 2 | Mar | Apparel | SALE30 | 30 |
| 3 | Jan | Nest-USA | ELEC10 | 10 |
| 4 | Feb | Nest-USA | ELEC20 | 20 |
# Inspect schema and first rows of the marketing-spend data
data4.info()
data4.head()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 365 entries, 0 to 364 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Date 365 non-null object 1 Offline_Spend 365 non-null int64 2 Online_Spend 365 non-null float64 dtypes: float64(1), int64(1), object(1) memory usage: 8.7+ KB
| Date | Offline_Spend | Online_Spend | |
|---|---|---|---|
| 0 | 1/1/2019 | 4500 | 2424.50 |
| 1 | 1/2/2019 | 4500 | 3480.36 |
| 2 | 1/3/2019 | 4500 | 1576.38 |
| 3 | 1/4/2019 | 4500 | 2928.55 |
| 4 | 1/5/2019 | 4500 | 4055.30 |
# Inspect schema and first rows of the tax data
data5.info()
data5.head()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 20 entries, 0 to 19 Data columns (total 2 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Product_Category 20 non-null object 1 GST 20 non-null float64 dtypes: float64(1), object(1) memory usage: 448.0+ bytes
| Product_Category | GST | |
|---|---|---|
| 0 | Nest-USA | 0.10 |
| 1 | Office | 0.10 |
| 2 | Apparel | 0.18 |
| 3 | Bags | 0.18 |
| 4 | Drinkware | 0.18 |
# Merge everything into one transaction-level frame:
# customers + sales (on CustomerID), then tax rates (on Product_Category),
# then daily marketing spend (joined on the raw m/d/Y date string).
data6 = data2.merge(data1, on = "CustomerID")
data7 = data6.merge(data5, on = "Product_Category")
data7=data7.merge(data4, left_on = 'Transaction_Date', right_on = 'Date')
# Parse the date strings, then derive the 3-letter month name ('Jan', ...)
data7['Transaction_Date'] = pd.to_datetime(data7['Transaction_Date'], format = '%m/%d/%Y')
data7['Month'] = data7['Transaction_Date'].dt.strftime('%b')
# Left-join coupons on (Month, Product_Category); rows with no matching
# coupon come back with NaN Coupon_Code/Discount_pct (400 rows, see below)
df = data7.merge(data3, on = ['Month', 'Product_Category'], how = 'left')
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 52924 entries, 0 to 52923 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CustomerID 52924 non-null int64 1 Gender 52924 non-null object 2 Location 52924 non-null object 3 Tenure_Months 52924 non-null int64 4 Transaction_ID 52924 non-null int64 5 Transaction_Date 52924 non-null datetime64[ns] 6 Product_SKU 52924 non-null object 7 Product_Description 52924 non-null object 8 Product_Category 52924 non-null object 9 Quantity 52924 non-null int64 10 Avg_Price 52924 non-null float64 11 Delivery_Charges 52924 non-null float64 12 Coupon_Status 52924 non-null object 13 GST 52924 non-null float64 14 Date 52924 non-null object 15 Offline_Spend 52924 non-null int64 16 Online_Spend 52924 non-null float64 17 Month 52924 non-null object 18 Coupon_Code 52524 non-null object 19 Discount_pct 52524 non-null float64 dtypes: datetime64[ns](1), float64(5), int64(5), object(9) memory usage: 8.5+ MB
# Count missing values per column (only the coupon fields have NaNs)
df.isnull().sum()
CustomerID 0 Gender 0 Location 0 Tenure_Months 0 Transaction_ID 0 Transaction_Date 0 Product_SKU 0 Product_Description 0 Product_Category 0 Quantity 0 Avg_Price 0 Delivery_Charges 0 Coupon_Status 0 GST 0 Date 0 Offline_Spend 0 Online_Spend 0 Month 0 Coupon_Code 400 Discount_pct 400 dtype: int64
# Impute missing coupon fields (rows whose month/category had no coupon)
# with the column mode.
# NOTE(review): giving a discount percentage to rows that never had a coupon
# is debatable — filling Discount_pct with 0 may be more faithful; confirm.
# Assign the result back instead of `df[col].fillna(..., inplace=True)`:
# chained inplace operates on a possibly-temporary Series and is deprecated
# under pandas Copy-on-Write.
df['Coupon_Code'] = df['Coupon_Code'].fillna(df['Coupon_Code'].mode()[0])
df['Discount_pct'] = df['Discount_pct'].fillna(df['Discount_pct'].mode()[0])
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 52924 entries, 0 to 52923 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CustomerID 52924 non-null int64 1 Gender 52924 non-null object 2 Location 52924 non-null object 3 Tenure_Months 52924 non-null int64 4 Transaction_ID 52924 non-null int64 5 Transaction_Date 52924 non-null datetime64[ns] 6 Product_SKU 52924 non-null object 7 Product_Description 52924 non-null object 8 Product_Category 52924 non-null object 9 Quantity 52924 non-null int64 10 Avg_Price 52924 non-null float64 11 Delivery_Charges 52924 non-null float64 12 Coupon_Status 52924 non-null object 13 GST 52924 non-null float64 14 Date 52924 non-null object 15 Offline_Spend 52924 non-null int64 16 Online_Spend 52924 non-null float64 17 Month 52924 non-null object 18 Coupon_Code 52924 non-null object 19 Discount_pct 52924 non-null float64 dtypes: datetime64[ns](1), float64(5), int64(5), object(9) memory usage: 8.5+ MB
# The raw 'Date' string column duplicates Transaction_Date, so discard it.
df = df.drop(columns=['Date'])
df.head()
| CustomerID | Gender | Location | Tenure_Months | Transaction_ID | Transaction_Date | Product_SKU | Product_Description | Product_Category | Quantity | Avg_Price | Delivery_Charges | Coupon_Status | GST | Offline_Spend | Online_Spend | Month | Coupon_Code | Discount_pct | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 17850 | M | Chicago | 12 | 16679 | 2019-01-01 | GGOENEBJ079499 | Nest Learning Thermostat 3rd Gen-USA - Stainle... | Nest-USA | 1 | 153.71 | 6.5 | Used | 0.1 | 4500 | 2424.5 | Jan | ELEC10 | 10.0 |
| 1 | 17850 | M | Chicago | 12 | 16680 | 2019-01-01 | GGOENEBJ079499 | Nest Learning Thermostat 3rd Gen-USA - Stainle... | Nest-USA | 1 | 153.71 | 6.5 | Used | 0.1 | 4500 | 2424.5 | Jan | ELEC10 | 10.0 |
| 2 | 17850 | M | Chicago | 12 | 16696 | 2019-01-01 | GGOENEBQ078999 | Nest Cam Outdoor Security Camera - USA | Nest-USA | 2 | 122.77 | 6.5 | Not Used | 0.1 | 4500 | 2424.5 | Jan | ELEC10 | 10.0 |
| 3 | 17850 | M | Chicago | 12 | 16699 | 2019-01-01 | GGOENEBQ079099 | Nest Protect Smoke + CO White Battery Alarm-USA | Nest-USA | 1 | 81.50 | 6.5 | Clicked | 0.1 | 4500 | 2424.5 | Jan | ELEC10 | 10.0 |
| 4 | 17850 | M | Chicago | 12 | 16700 | 2019-01-01 | GGOENEBJ079499 | Nest Learning Thermostat 3rd Gen-USA - Stainle... | Nest-USA | 1 | 153.71 | 6.5 | Clicked | 0.1 | 4500 | 2424.5 | Jan | ELEC10 | 10.0 |
# Summary statistics for all numeric columns
df.describe()
| CustomerID | Tenure_Months | Transaction_ID | Quantity | Avg_Price | Delivery_Charges | GST | Offline_Spend | Online_Spend | Discount_pct | |
|---|---|---|---|---|---|---|---|---|---|---|
| count | 52924.00000 | 52924.000000 | 52924.000000 | 52924.000000 | 52924.000000 | 52924.000000 | 52924.000000 | 52924.000000 | 52924.000000 | 52924.000000 |
| mean | 15346.70981 | 26.127995 | 32409.825675 | 4.497638 | 52.237646 | 10.517630 | 0.137462 | 2830.914141 | 1893.109119 | 19.953518 |
| std | 1766.55602 | 13.478285 | 8648.668977 | 20.104711 | 64.006882 | 19.475613 | 0.045825 | 936.154247 | 807.014092 | 8.096509 |
| min | 12346.00000 | 2.000000 | 16679.000000 | 1.000000 | 0.390000 | 0.000000 | 0.050000 | 500.000000 | 320.250000 | 10.000000 |
| 25% | 13869.00000 | 15.000000 | 25384.000000 | 1.000000 | 5.700000 | 6.000000 | 0.100000 | 2500.000000 | 1252.630000 | 10.000000 |
| 50% | 15311.00000 | 27.000000 | 32625.500000 | 1.000000 | 16.990000 | 6.000000 | 0.180000 | 3000.000000 | 1837.870000 | 20.000000 |
| 75% | 16996.25000 | 37.000000 | 39126.250000 | 2.000000 | 102.130000 | 6.500000 | 0.180000 | 3500.000000 | 2425.350000 | 30.000000 |
| max | 18283.00000 | 50.000000 | 48497.000000 | 900.000000 | 355.740000 | 521.360000 | 0.180000 | 5000.000000 | 4556.930000 | 30.000000 |
# Original dimension of the merged dataset
df.shape
(52924, 19)
# Check for fully duplicated rows (all False => no duplicates)
df.duplicated().value_counts()
False 52924 dtype: int64
# Round the values to the nearest integer
# NOTE(review): Quantity is already int64 (see df.info() above), so this
# round/cast is a no-op kept for safety.
df['Quantity'] = np.round(df['Quantity']).astype(int)
# Plot the original Quantity histogram distribution
df['Quantity'].plot(kind='hist', bins=10, title='Original Quantity Distribution')
plt.show()
# Log transformation - adding 1 to avoid log(0) and round the transformed values to the nearest integer
df['Quantity_transformed'] = np.round(np.log1p(df['Quantity'])).astype(int)
# Plot the transformed Quantity histogram distribution
df['Quantity_transformed'].plot(kind='hist', bins=10, title='Log Transformed Quantity Distribution')
plt.show()
df['Quantity_transformed'].dtype
dtype('int64')
# 1st/99th percentile bounds of the transformed quantity (candidate outlier caps)
upperlimit_Q = df['Quantity_transformed'].quantile(0.99)
lowerlimit_Q = df['Quantity_transformed'].quantile(0.01)
upperlimit_Q, lowerlimit_Q
(4.0, 1.0)
# Plot the original Delivery_Charges histogram distribution
df['Delivery_Charges'].plot(kind='hist', bins=10, title='Original Delivery_Charges Distribution')
plt.show()
# Log transformation - adding 1 to avoid log(0)
df['Delivery_Charges_transformed'] = np.log1p(df['Delivery_Charges'])
# Plot the transformed Delivery_Charges histogram distribution
df['Delivery_Charges_transformed'].plot(kind='hist', bins=10, title='Log Transformed Delivery_Charges Distribution')
plt.show()
# Box plot of the transformed distribution (shows remaining outliers)
df['Delivery_Charges_transformed'].plot(kind='box', title='Log Transformed Delivery_Charges Distribution')
<AxesSubplot:title={'center':'Log Transformed Delivery_Charges Distribution'}>
# Count transactions with unusually high delivery charges (> 100).
# (The previous comment referred to "Quantity greater than 2" and the
# variable was named greater_than_5_count — both were wrong for this code.)
high_delivery_count = np.sum(df['Delivery_Charges'] > 100)
high_delivery_count
358
# 1st/99th percentile bounds of the transformed delivery charges
upperlimit_D = df['Delivery_Charges_transformed'].quantile(0.99)
lowerlimit_D = df['Delivery_Charges_transformed'].quantile(0.01)
upperlimit_D, lowerlimit_D
(4.330733340286331, 1.9459101490553132)
# Plot the original Avg_Price histogram distribution
df['Avg_Price'].plot(kind='hist', bins=10, title='Original Average Price Distribution')
plt.show()
# Log transformation - adding 1 to avoid log(0)
df['Avg_Price_transformed'] = np.log1p(df['Avg_Price'])
# Plot the transformed Avg_Price histogram distribution
df['Avg_Price_transformed'].plot(kind='hist', bins=10, title='Log Transformed Average Price Distribution')
plt.show()
# Plot the original Online_Spend histogram distribution
df['Online_Spend'].plot(kind='hist', bins=10, title='Original Online Spend Distribution')
plt.show()
# Square-root transformation to reduce right skew.
# (The previous comments and plot title said "Log", but np.sqrt is applied.)
df['Online_Spend_transformed'] = np.sqrt(df['Online_Spend'])
# Plot the transformed Online_Spend histogram distribution
df['Online_Spend_transformed'].plot(kind='hist', bins = 10, title='Sqrt Transformed Online Spend Distribution')
plt.show()
# define a funtion replace_outliers which return a dataframe with no outliers (using capping method)
# def replace_outliers(df, columns):
# for column in columns:
# Calculate lower and upper bounds using 0.01 and 0.99 quantiles
# lower_bound = df[column].quantile(0.01)
# upper_bound = df[column].quantile(0.99)
# Replace outliers with lower and upper bounds
# df[column] = df[column].apply(lambda x: upper_bound if x > upper_bound else (lower_bound if x < lower_bound else x))
# return df
# List of columns where outliers should be replaced
#columns_to_replace = ['Offline_Spend', 'Online_Spend']
# Replace outliers in specified columns
#df_cleaned = replace_outliers(df, columns_to_replace)
#df.shape
#df_cleaned.shape
#df_cleaned.describe()
# Invoice amount per line item: quantity * unit price, discounted only when
# the coupon was actually used, then GST applied and delivery charges added.
gross = df['Quantity'] * df['Avg_Price']
discount_factor = np.where(df['Coupon_Status'] == 'Used',
                           1 - df['Discount_pct'] / 100,
                           1.0)
df['Invoice'] = gross * discount_factor * (1 + df['GST']) + df['Delivery_Charges']
# Convert the low-cardinality text columns to pandas categoricals
# (smaller memory footprint, faster groupbys).
for _col in ['Product_Category', 'Coupon_Status', 'Gender',
             'Location', 'Month', 'Coupon_Code']:
    df[_col] = df[_col].astype('category')
df.describe()
| CustomerID | Tenure_Months | Transaction_ID | Quantity | Avg_Price | Delivery_Charges | GST | Offline_Spend | Online_Spend | Discount_pct | Quantity_transformed | Delivery_Charges_transformed | Avg_Price_transformed | Online_Spend_transformed | Invoice | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 52924.00000 | 52924.000000 | 52924.000000 | 52924.000000 | 52924.000000 | 52924.000000 | 52924.000000 | 52924.000000 | 52924.000000 | 52924.000000 | 52924.000000 | 52924.000000 | 52924.000000 | 52924.000000 | 52924.000000 |
| mean | 15346.70981 | 26.127995 | 32409.825675 | 4.497638 | 52.237646 | 10.517630 | 0.137462 | 2830.914141 | 1893.109119 | 19.953518 | 1.249887 | 2.179542 | 3.126378 | 42.467054 | 101.947455 |
| std | 1766.55602 | 13.478285 | 8648.668977 | 20.104711 | 64.006882 | 19.475613 | 0.045825 | 936.154247 | 807.014092 | 8.096509 | 0.673419 | 0.539681 | 1.399033 | 9.468905 | 172.281095 |
| min | 12346.00000 | 2.000000 | 16679.000000 | 1.000000 | 0.390000 | 0.000000 | 0.050000 | 500.000000 | 320.250000 | 10.000000 | 1.000000 | 0.000000 | 0.329304 | 17.895530 | 4.603500 |
| 25% | 13869.00000 | 15.000000 | 25384.000000 | 1.000000 | 5.700000 | 6.000000 | 0.100000 | 2500.000000 | 1252.630000 | 10.000000 | 1.000000 | 1.945910 | 1.902108 | 35.392513 | 20.160000 |
| 50% | 15311.00000 | 27.000000 | 32625.500000 | 1.000000 | 16.990000 | 6.000000 | 0.180000 | 3000.000000 | 1837.870000 | 20.000000 | 1.000000 | 1.945910 | 2.889816 | 42.870386 | 45.636200 |
| 75% | 16996.25000 | 37.000000 | 39126.250000 | 2.000000 | 102.130000 | 6.500000 | 0.180000 | 3500.000000 | 2425.350000 | 30.000000 | 1.000000 | 2.014903 | 4.635990 | 49.247843 | 137.400000 |
| max | 18283.00000 | 50.000000 | 48497.000000 | 900.000000 | 355.740000 | 521.360000 | 0.180000 | 5000.000000 | 4556.930000 | 30.000000 | 7.000000 | 6.258357 | 5.877007 | 67.505037 | 8979.275000 |
from scipy.stats import mode
# Define custom aggregation functions
def first_value(series):
    """Pick the first entry of a group (groupby aggregation helper)."""
    return series.iat[0]
def last_value(series):
    """Pick the last entry of a group (groupby aggregation helper)."""
    return series.iat[-1]
# Mode aggregator that tolerates groups with no mode (e.g. all-NaN groups).
def mode_ignore_nan(series):
    """Return the most frequent value of *series* (Series.mode drops NaN),
    or None when the group has no mode at all."""
    modes = series.mode()  # compute once instead of twice as before
    return modes.iloc[0] if not modes.empty else None
# Group by CustomerId and Aggregate data
agg_df = df.groupby('CustomerID').agg({
    'Transaction_Date': lambda x: (x.max() - x.min()).days,  # customer lifespan in days
    'Transaction_ID': 'count',   # number of purchase rows
    'Invoice': 'sum',            # total spend
    'Quantity':'sum',
    'Gender': first_value,       # constant per customer; take the first
    'Location': first_value,
    'Tenure_Months': last_value,
    'Product_Category': mode_ignore_nan,  # most frequently bought category
    'Coupon_Status': mode_ignore_nan      # most frequent coupon behaviour
})
# Rename the columns to reporting-friendly names
agg_df.columns = ['Number of Days', 'Number of Transaction', 'Total Purchase Amount', 'Quantity', 'Gender', 'Location', 'Tenure', 'Product Category', 'Coupon Status']
agg_df.head()
| Number of Days | Number of Transaction | Total Purchase Amount | Quantity | Gender | Location | Tenure | Product Category | Coupon Status | |
|---|---|---|---|---|---|---|---|---|---|
| CustomerID | |||||||||
| 12346 | 0 | 2 | 174.98174 | 3 | F | New York | 31 | Apparel | Used |
| 12347 | 223 | 60 | 15686.84396 | 342 | M | New York | 20 | Nest-USA | Clicked |
| 12348 | 119 | 23 | 1689.55594 | 209 | M | California | 39 | Office | Clicked |
| 12350 | 0 | 17 | 1467.43528 | 21 | M | California | 25 | Apparel | Clicked |
| 12356 | 0 | 36 | 2007.42900 | 56 | F | Chicago | 31 | Apparel | Clicked |
# Verify dtypes / non-null counts of the aggregated customer frame
agg_df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 1468 entries, 12346 to 18283 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Number of Days 1468 non-null int64 1 Number of Transaction 1468 non-null int64 2 Total Purchase Amount 1468 non-null float64 3 Quantity 1468 non-null int64 4 Gender 1468 non-null object 5 Location 1468 non-null object 6 Tenure 1468 non-null int64 7 Product Category 1468 non-null object 8 Coupon Status 1468 non-null object dtypes: float64(1), int64(4), object(4) memory usage: 114.7+ KB
# Average Order Value per customer
agg_df['AOV'] = agg_df['Total Purchase Amount']/agg_df['Number of Transaction']
# Purchase Frequency: average number of transactions per customer
purchase_freq = sum(agg_df['Number of Transaction'])/len(agg_df)
# Repeat Rate: share of customers with more than one transaction
repeat_rate = agg_df[agg_df['Number of Transaction']>1].shape[0]/agg_df.shape[0]
# Churn Rate (the same scalar is broadcast to every customer)
agg_df['Churn Rate'] = 1 - repeat_rate
# Profit Margin: assumes a flat 15% margin on total purchases
agg_df['Profit Margin'] = agg_df['Total Purchase Amount'] * 0.15
# (Removed a stray `where:` line left over from a formula note — it was a
#  SyntaxError when running this as a script.)
# CLV1: AOV * purchase frequency, scaled by mean customer lifespan
agg_df['CLV1'] = (agg_df['AOV']*purchase_freq)*(agg_df['Number of Days'].mean()/len(agg_df))
# CLV2: AOV * transactions * margin * churn
agg_df['CLV2'] = agg_df['AOV'] * agg_df['Number of Transaction'] * 0.15 * agg_df['Churn Rate']
agg_df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 1468 entries, 12346 to 18283 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Number of Days 1468 non-null int64 1 Number of Transaction 1468 non-null int64 2 Total Purchase Amount 1468 non-null float64 3 Quantity 1468 non-null int64 4 Gender 1468 non-null object 5 Location 1468 non-null object 6 Tenure 1468 non-null int64 7 Product Category 1468 non-null object 8 Coupon Status 1468 non-null object 9 AOV 1468 non-null float64 10 Churn Rate 1468 non-null float64 11 Profit Margin 1468 non-null float64 12 CLV1 1468 non-null float64 13 CLV2 1468 non-null float64 dtypes: float64(6), int64(4), object(4) memory usage: 172.0+ KB
# Bar chart of transaction counts per customer location
location_counts = df['Location'].value_counts()
plt.figure(figsize=(10, 6))
location_counts.plot(kind='bar')
plt.title('Frequency of Locations')
plt.xlabel('Location')
plt.ylabel('Frequency')
plt.xticks(rotation=45) # Rotate x-axis labels for readability
plt.show()
# Selecting numeric columns
numeric_columns = df.select_dtypes(include=['number']).columns
# IDs and transformed duplicates are excluded from the correlation matrix
columns_to_drop = ['CustomerID', 'Transaction_ID', 'Online_Spend', 'Quantity_transformed','Delivery_Charges_transformed','Avg_Price_transformed']
# Build the numeric frame and drop in one expression; the previous
# `newdf.drop(..., inplace=True)` mutated a slice of df and triggered
# SettingWithCopyWarning.
newdf = df[numeric_columns].drop(columns=columns_to_drop)
# Creating the correlation heatmap
plt.figure(figsize=(10, 8))
sb.heatmap(newdf.corr(), annot=True, cmap='Blues')
plt.title('Correlation Heatmap')
plt.show()
# Transaction counts per month (display only)
df.groupby('Month')['CustomerID'].count()
# Unique customers per month, reindexed into calendar order.
# BUG FIX: 'Month' was cast to a categorical, so groupby returns groups in
# alphabetical category order (Apr, Aug, Dec, ...), while the xticks below
# label the bars Jan..Dec — the bars were mislabeled. Reindexing makes the
# bar order match the labels.
calendar_order = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
monthly_customer_count = df.groupby('Month')['CustomerID'].nunique().reindex(calendar_order)
plt.figure(figsize=(8, 4))
monthly_customer_count.plot(kind='bar', color = 'lightcoral')
plt.xlabel('Month')
plt.ylabel('Customer Count')
plt.title('Customer Count by Month')
plt.xticks(ticks=range(0, 12), labels=calendar_order, rotation=45)
plt.show()
# Understand the retention of customers on a month-on-month basis
# For each month name, collect the list of customers who transacted in it.
month_dict = {}
for i in df['Month'].unique():
    month_dict[i] = df[df['Month']==i]['CustomerID'].unique().tolist()
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
# Retention for month m = customers active in both m-1 and m;
# January has no predecessor, hence the leading 0.
retention = [0]
for i in range(11):
    set1 = set(month_dict[months[i]])
    set2 = set(month_dict[months[i+1]])
    common_items = len(set1.intersection(set2))
    retention.append(common_items)
plt.figure(figsize=(8, 3))
plt.bar(months, retention, color='lightcoral')
plt.xlabel('Months')
plt.ylabel('Retention Count')
plt.title('Customer Retention by Month')
plt.xticks(rotation=45)
plt.show()
# Extract the month number (1..12) from the transaction date
df['Transaction_Month'] = df['Transaction_Date'].dt.month
# Unique months present
df['Transaction_Month'].unique()
# Total revenue (Invoice) for each month
df_revenue = df.groupby('Transaction_Month')['Invoice'].sum().reset_index()
df_revenue
# Month-over-month revenue growth. pct_change replaces the previous manual
# loop and produces identical values; the first month (no predecessor)
# stays 0 as before.
df_revenue['growth_rate'] = df_revenue['Invoice'].pct_change().fillna(0)
# Print dataframe
df_revenue
| Transaction_Month | Invoice | growth_rate | |
|---|---|---|---|
| 0 | 1 | 493977.33055 | 0.000000 |
| 1 | 2 | 375052.31962 | -0.240750 |
| 2 | 3 | 414649.53948 | 0.105578 |
| 3 | 4 | 477082.59730 | 0.150568 |
| 4 | 5 | 365386.18740 | -0.234124 |
| 5 | 6 | 360909.60021 | -0.012252 |
| 6 | 7 | 451645.43540 | 0.251409 |
| 7 | 8 | 475670.79902 | 0.053195 |
| 8 | 9 | 396481.70738 | -0.166479 |
| 9 | 10 | 480728.77581 | 0.212487 |
| 10 | 11 | 547770.52846 | 0.139459 |
| 11 | 12 | 556112.29179 | 0.015229 |
# Visualize monthly growth rate.
# The original code filtered with `!= '12'` — comparing the integer month
# (and, in one case, the growth rate itself) to a *string*, which is always
# True. The dead filters are removed; all 12 months are plotted, exactly as
# before.
plot1 = [
    go.Scatter(
        x = df_revenue['Transaction_Month'],
        y = df_revenue['growth_rate']
    )
]
plot_layout = go.Layout(xaxis = {"type":"category"}, title = 'Monthly Growth Rate')
fig = go.Figure(data = plot1, layout = plot_layout)
pyoff.plot(fig)
'temp-plot.html'
# Total units sold per month
df_monthlysales = df.groupby('Transaction_Month')['Quantity'].sum().reset_index()
# Visualize monthly order volume
plot2 = [
    go.Scatter(
        x = df_monthlysales['Transaction_Month'],
        y = df_monthlysales['Quantity']
    )
]
plot2_layout = go.Layout(xaxis = {"type":"category"}, title = 'Monthly Total Number of Orders')
fig2 = go.Figure(data = plot2, layout = plot2_layout)
pyoff.plot(fig2)
'temp-plot.html'
# First purchase date per customer, plus the month it fell in.
df_first_purchase = (
    df.groupby('CustomerID')['Transaction_Date']
      .min()
      .reset_index()
)
df_first_purchase.columns = ['CustomerID', 'First Purchase Date']
df_first_purchase['First_Purchase_Month'] = df_first_purchase['First Purchase Date'].dt.month
# Attach the first-purchase info back onto every transaction row
df = df.merge(df_first_purchase, on = 'CustomerID')
# A row counts as 'New' in the customer's first-purchase month and
# 'Existing' in any later month.
df['UserType'] = np.where(
    df['Transaction_Month'] == df['First_Purchase_Month'], 'New', 'Existing'
)
# Count rows per user type
df['UserType'].value_counts()
New 32033 Existing 20891 Name: UserType, dtype: int64
# Monthly revenue split by user type (New vs Existing)
df_usertype_revenue = df.groupby(['Transaction_Month', 'UserType'])['Invoice'].sum().reset_index()
# Drop the boundary months: January has no 'Existing' users yet, and
# December is treated as incomplete downstream as well.
df_usertype_revenue = df_usertype_revenue.query("Transaction_Month != 1 and Transaction_Month != 12")
df_usertype_revenue
| Transaction_Month | UserType | Invoice | |
|---|---|---|---|
| 1 | 2 | Existing | 46524.59322 |
| 2 | 2 | New | 328527.72640 |
| 3 | 3 | Existing | 68630.05180 |
| 4 | 3 | New | 346019.48768 |
| 5 | 4 | Existing | 195895.51546 |
| 6 | 4 | New | 281187.08184 |
| 7 | 5 | Existing | 132229.87076 |
| 8 | 5 | New | 233156.31664 |
| 9 | 6 | Existing | 143638.73501 |
| 10 | 6 | New | 217270.86520 |
| 11 | 7 | Existing | 271483.00300 |
| 12 | 7 | New | 180162.43240 |
| 13 | 8 | Existing | 246865.43182 |
| 14 | 8 | New | 228805.36720 |
| 15 | 9 | Existing | 234028.56387 |
| 16 | 9 | New | 162453.14351 |
| 17 | 10 | Existing | 223293.20299 |
| 18 | 10 | New | 257435.57282 |
| 19 | 11 | Existing | 313156.60430 |
| 20 | 11 | New | 234613.92416 |
# Visualize revenue of existing vs new customers per month (plotly)
plot3 = [
    go.Scatter(
        x = df_usertype_revenue.query("UserType == 'Existing'")['Transaction_Month'],
        y = df_usertype_revenue.query("UserType == 'Existing'")['Invoice'],
        name = 'Existing'
    ),
    go.Scatter(
        x = df_usertype_revenue.query("UserType == 'New'")['Transaction_Month'],
        y = df_usertype_revenue.query("UserType == 'New'")['Invoice'],
        name = 'New'
    )
]
plot3_layout = go.Layout(xaxis = {"type":"category"}, title = 'Revenue of New vs. Existing Customers')
fig3 = go.Figure(data = plot3, layout = plot3_layout)
pyoff.plot(fig3)
'temp-plot.html'
# Filter out the first and last months (assuming they are incomplete)
# NOTE(review): df_usertype_revenue was already filtered to exclude months
# 1 and 12 above, so this re-filter is a harmless no-op kept for safety.
df_usertype_revenue_filtered = df_usertype_revenue[(df_usertype_revenue['Transaction_Month'] != 1) & (df_usertype_revenue['Transaction_Month'] != 12)]
# Create separate dataframes for new and existing customers
df_new_customer_revenue = df_usertype_revenue_filtered[df_usertype_revenue_filtered['UserType'] == 'New']
df_existing_customer_revenue = df_usertype_revenue_filtered[df_usertype_revenue_filtered['UserType'] == 'Existing']
# Create a line plot for new and existing customer revenue (matplotlib)
plt.figure(figsize=(10, 6))
plt.plot(df_new_customer_revenue['Transaction_Month'], df_new_customer_revenue['Invoice'], marker='o', label='New Customer Revenue')
plt.plot(df_existing_customer_revenue['Transaction_Month'], df_existing_customer_revenue['Invoice'], marker='o', label='Existing Customer Revenue')
plt.title('Revenue by User Type Over Time (Excluding First and Last Months)')
plt.xlabel('Transaction Month')
plt.ylabel('Revenue')
plt.xticks(range(2, 12), ['Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov'])
plt.legend()
plt.grid(True)
plt.show()
# Ratio of new to existing (unique) customers each month.
# (The previous comment said "percentage", but the code computes
# new/existing; January is NaN because it has no existing customers.)
count_new = df.query("UserType=='New'").groupby(['Transaction_Month'])['CustomerID'].nunique()
count_existing = df.query("UserType=='Existing'").groupby(['Transaction_Month'])['CustomerID'].nunique()
ratio_new_to_existing = count_new / count_existing
ratio_new_to_existing
Transaction_Month 1 NaN 2 7.384615 3 5.709677 4 2.672131 5 1.272727 6 1.122951 7 0.661972 8 0.818182 9 0.678261 10 0.707317 11 0.566667 12 0.815385 Name: CustomerID, dtype: float64
# Drop the NaN January row and bar-plot the ratio per month
ratio_new_to_existing = ratio_new_to_existing.reset_index()
ratio_new_to_existing = ratio_new_to_existing.dropna()
# Visualize
plot4 = [
    go.Bar(
        x = ratio_new_to_existing['Transaction_Month'],
        y = ratio_new_to_existing['CustomerID']
    )
]
plot4_layout = go.Layout(
    xaxis = {"type":"category"},
    title = 'New Customer Ratio'
)
fig4 = go.Figure(data = plot4, layout = plot4_layout)
pyoff.plot(fig4)
'temp-plot.html'
# Calculate Revenue by each customer by each month -> assign to a dataframe
df_user_purchase = df.groupby(['CustomerID', 'Transaction_Month'])['Invoice'].sum().astype(int).reset_index()
# Crosstab of CustomerID x month; since df_user_purchase has exactly one row
# per (customer, month), the cell values are 0/1 activity indicators.
df_retention = pd.crosstab(df_user_purchase['CustomerID'], df_user_purchase['Transaction_Month']).reset_index()
# Month column labels (skipping the CustomerID column).
# NOTE(review): this `months` shadows the month-name list defined earlier.
months = df_retention.columns[1:]
# For each month, count active users and users also active the month before.
retention_array = []
for i in range(len(months) - 1):
    retention_data = {}
    selected_month = months[i + 1]
    prev_month = months[i]
    retention_data['Transaction_Month'] = selected_month
    retention_data['Total User Count'] = df_retention[selected_month].sum()
    retention_data['Retained User Count'] = df_retention[df_retention[prev_month] > 0][selected_month].sum()
    retention_array.append(retention_data)
# Rebind df_retention to the per-month summary (the crosstab is discarded)
df_retention = pd.DataFrame(retention_array)
# Retention Rate = retained / total active users in the month
df_retention['Retention Rate'] = df_retention['Retained User Count'] / df_retention['Total User Count']
# Churn Rate is its complement
df_retention['Churn Rate'] = 1 - df_retention['Retention Rate']
print(df_retention)
Transaction_Month Total User Count Retained User Count Retention Rate \
0 2 109 13 0.119266
1 3 208 11 0.052885
2 4 224 24 0.107143
3 5 200 25 0.125000
4 6 259 37 0.142857
5 7 236 58 0.245763
6 8 300 65 0.216667
7 9 193 44 0.227979
8 10 210 29 0.138095
9 11 188 31 0.164894
10 12 236 28 0.118644
Churn Rate
0 0.880734
1 0.947115
2 0.892857
3 0.875000
4 0.857143
5 0.754237
6 0.783333
7 0.772021
8 0.861905
9 0.835106
10 0.881356
import plotly.graph_objects as go
# Retention vs. churn rate, month over month, on one line chart.
rate_traces = [
    go.Scatter(
        x=df_retention['Transaction_Month'],
        y=df_retention['Retention Rate'],
        mode='lines+markers',
        name='Retention Rate',
        line=dict(color='navy'),
    ),
    go.Scatter(
        x=df_retention['Transaction_Month'],
        y=df_retention['Churn Rate'],
        mode='lines+markers',
        name='Churn Rate',
        line=dict(color='darkorange', dash='dash', width=5),
    ),
]
fig = go.Figure(
    data=rate_traces,
    layout=go.Layout(
        title='<b>Retention and Churn Rates Over Time</b>',
        xaxis=dict(title='<b>Transaction Month</b>'),
        yaxis=dict(title='<b>Rate</b>'),
        plot_bgcolor='rgba(0,0,0,0)',
        paper_bgcolor='rgba(0,0,0,0)',
        font=dict(color='black'),
        legend=dict(x=0.02, y=0.6),
    ),
)
fig.show()
# Start the RFM table with one row per unique customer.
df_user = pd.DataFrame({'CustomerID': df['CustomerID'].unique()})
# Recency = days between a customer's latest purchase and the latest purchase
# date observed across all customers (0 = purchased on the last observed day).
df_recency = df.groupby('CustomerID').Transaction_Date.max().reset_index()
df_recency['Recency'] = (df_recency['Transaction_Date'].max() - df_recency['Transaction_Date']).dt.days
df_recency.columns = ['CustomerID', 'Last_Purchase_Date', 'Recency']
df_user = df_user.merge(df_recency[['CustomerID', 'Recency']], on='CustomerID')
df_user.Recency.describe()
count 1468.000000 mean 144.292234 std 101.936959 min 0.000000 25% 55.000000 50% 131.000000 75% 220.000000 max 364.000000 Name: Recency, dtype: float64
# Frequency = number of recorded transactions per customer.
df_frequency = (
    df.groupby('CustomerID')['Transaction_Date']
    .count()
    .reset_index()
    .rename(columns={'Transaction_Date': 'Frequency'})
)
df_user = df_user.merge(df_frequency, on='CustomerID')
df_user.Frequency.describe()
count 1468.000000 mean 36.051771 std 50.885680 min 1.000000 25% 10.000000 50% 21.000000 75% 46.000000 max 695.000000 Name: Frequency, dtype: float64
# Monetary = total invoice value per customer.
df_monetary = (
    df.groupby('CustomerID')['Invoice']
    .sum()
    .reset_index()
    .rename(columns={'Invoice': 'Monetary'})
)
df_user = df_user.merge(df_monetary, on='CustomerID')
df_user.Monetary.describe()
count 1468.000000 mean 3675.386316 std 5841.589871 min 6.990000 25% 783.974310 50% 2011.622610 75% 4495.056630 max 86910.893840 Name: Monetary, dtype: float64
# Sanity check: df_user now holds CustomerID plus the three RFM features.
print(df_user.columns)
Index(['CustomerID', 'Recency', 'Frequency', 'Monetary'], dtype='object')
# Attach the aggregated customer metrics (including CLV1) to the RFM table,
# then keep only the four clustering features.
df_merge = df_user.merge(agg_df, on='CustomerID', how='left')
df_cluster = df_merge[['Recency', 'Frequency', 'Monetary', 'CLV1']]
# Box plots of the raw (unscaled) clustering features.
plt.figure(figsize=(10, 6))
sb.boxplot(data=df_cluster)
plt.title('Box Plot of Recency, Frequency, Monetary, and CLV ')
plt.xlabel('Variables')
plt.ylabel('Values')
plt.show()
# Standardize the features to zero mean / unit variance.
# NOTE(review): scaled_features is only visualized here; the k-means runs
# further down operate on the unscaled df_cluster values — confirm intent.
scaler = StandardScaler()
scaled_features = pd.DataFrame(scaler.fit_transform(df_cluster))
# Box plots of the standardized features.
plt.figure(figsize=(10, 6))
sb.boxplot(data=scaled_features)
plt.title('Box Plot of Recency, Frequency, Monetary, and CLV')
plt.xlabel('Variables')
plt.ylabel('Values')
plt.show()
# Elbow method: run k-means for k = 1..9 and record the inertia (SSE).
# Fix: the original fit each k on the whole df_cluster_copy frame while also
# writing the labels back into it, so every fit after k=1 silently included
# the previous iteration's cluster labels as an extra feature. Fit on the
# four CLV features only; the 'clusters' column is still produced because
# later code drops it by name.
sse = {}
cluster_feature_cols = ['Recency', 'Frequency', 'Monetary', 'CLV1']
df_cluster_copy = df_cluster.copy()  # work on a copy; df_cluster stays untouched
for k in range(1, 10):
    kmeans = KMeans(n_clusters=k, max_iter=100).fit(df_cluster_copy[cluster_feature_cols])
    df_cluster_copy["clusters"] = kmeans.labels_
    sse[k] = kmeans.inertia_
plt.figure(figsize=(8, 6))
plt.plot(list(sse.keys()), list(sse.values()), marker='o', color='royalblue')
plt.title('Inertia vs. Number of Clusters', fontsize=18)
plt.xlabel('Number of Clusters', fontsize=15)
plt.ylabel('Inertia', fontsize=15)
plt.show()
# Fit the final model with 3 clusters (chosen from the elbow curve above).
# Fixes: the old comment said "4 clusters" while the code used 3, and the fit
# ran on the whole frame — which still carried the elbow loop's leftover
# 'clusters' label column as an extra feature. Fit on the CLV features only.
final_feature_cols = ['Recency', 'Frequency', 'Monetary', 'CLV1']
kmeans = KMeans(n_clusters=3)
kmeans.fit(df_cluster_copy[final_feature_cols])
df_cluster_copy['Cluster_no'] = kmeans.predict(df_cluster_copy[final_feature_cols])
df_cluster_copy.groupby('Cluster_no').describe()
| Recency | Frequency | ... | CLV1 | clusters | |||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | mean | std | min | 25% | 50% | 75% | max | count | mean | ... | 75% | max | count | mean | std | min | 25% | 50% | 75% | max | |
| Cluster_no | |||||||||||||||||||||
| 0 | 1227.0 | 151.683782 | 101.196469 | 0.0 | 65.00 | 140.0 | 224.0 | 364.0 | 1227.0 | 21.558272 | ... | 190.087282 | 1660.694105 | 1227.0 | 3.341483 | 2.489115 | 0.0 | 0.0 | 4.0 | 4.0 | 7.0 |
| 1 | 7.0 | 71.571429 | 123.357012 | 0.0 | 11.00 | 16.0 | 62.0 | 339.0 | 7.0 | 501.285714 | ... | 223.154023 | 234.016186 | 7.0 | 4.000000 | 1.914854 | 2.0 | 2.0 | 5.0 | 5.5 | 6.0 |
| 2 | 234.0 | 107.709402 | 96.731082 | 1.0 | 32.25 | 79.0 | 159.0 | 362.0 | 234.0 | 98.132479 | ... | 210.686547 | 794.758900 | 234.0 | 4.303419 | 2.485544 | 1.0 | 3.0 | 3.0 | 8.0 | 8.0 |
3 rows × 40 columns
df_cluster_copy
| Recency | Frequency | Monetary | CLV1 | clusters | Cluster_no | |
|---|---|---|---|---|---|---|
| 0 | 339 | 297 | 40531.38476 | 215.697333 | 5 | 1 |
| 1 | 13 | 44 | 3348.77498 | 120.293752 | 0 | 0 |
| 2 | 151 | 42 | 3320.84496 | 124.970956 | 0 | 0 |
| 3 | 123 | 6 | 967.73992 | 254.927794 | 4 | 0 |
| 4 | 32 | 58 | 8071.68837 | 219.961248 | 3 | 2 |
| ... | ... | ... | ... | ... | ... | ... |
| 1463 | 114 | 5 | 65.08636 | 20.574522 | 4 | 0 |
| 1464 | 86 | 34 | 4535.09298 | 210.822549 | 7 | 0 |
| 1465 | 86 | 11 | 1053.68428 | 151.400600 | 4 | 0 |
| 1466 | 86 | 28 | 2440.62594 | 137.769465 | 0 | 0 |
| 1467 | 35 | 94 | 11252.83782 | 189.209957 | 8 | 2 |
1468 rows × 6 columns
# 2x2 grid: one scatter panel per RFM feature pair, coloured by cluster,
# with each cluster's centroid (mean point) overlaid in red.
cluster_palette = sb.color_palette('viridis', df_cluster_copy['Cluster_no'].nunique())
fig, axs = plt.subplots(2, 2, figsize=(12, 10))
panel_pairs = [
    (axs[0, 0], 'Recency', 'Frequency'),
    (axs[0, 1], 'Recency', 'Monetary'),
    (axs[1, 0], 'Frequency', 'Monetary'),
]
for ax, x_col, y_col in panel_pairs:
    sb.scatterplot(data=df_cluster_copy, x=x_col, y=y_col, hue='Cluster_no', palette=cluster_palette, ax=ax)
    ax.set_title(f'{x_col} vs {y_col}')
# Overlay per-cluster centroids on every panel.
for cluster in np.unique(df_cluster_copy['Cluster_no']):
    members = df_cluster_copy[df_cluster_copy['Cluster_no'] == cluster]
    for ax, x_col, y_col in panel_pairs:
        ax.plot(members[x_col].mean(), members[y_col].mean(), marker='o', markersize=8, markeredgecolor='black', label=f'Cluster {cluster}', linestyle='', color='red')
# The fourth panel carries no data.
axs[1, 1].axis('off')
plt.tight_layout()
plt.show()
from mpl_toolkits.mplot3d import Axes3D
# 3-D scatter of Recency / Frequency / Monetary, coloured by cluster, with
# the cluster centroids highlighted in red.
fig = plt.figure(figsize=(12, 10))
ax = fig.add_subplot(111, projection='3d')
for cluster, color in zip(np.unique(df_cluster_copy['Cluster_no']), cluster_palette):
    cluster_data = df_cluster_copy[df_cluster_copy['Cluster_no'] == cluster]
    # Fix: pass the single RGB tuple via `color=` — passing it as `c=` made
    # matplotlib warn that it could be mistaken for a value-mapping sequence.
    ax.scatter(cluster_data['Recency'], cluster_data['Frequency'], cluster_data['Monetary'], color=color, label=f'Cluster {cluster}')
# Mark each cluster centroid (per-cluster feature means).
for cluster in np.unique(df_cluster_copy['Cluster_no']):
    centroid = df_cluster_copy[df_cluster_copy['Cluster_no'] == cluster][['Recency', 'Frequency', 'Monetary']].mean()
    ax.scatter(centroid['Recency'], centroid['Frequency'], centroid['Monetary'], marker='o', s=100, color='red', label=f'Cluster {cluster} Centroid')
ax.set_xlabel('Recency')
ax.set_ylabel('Frequency')
ax.set_zlabel('Monetary')
ax.set_title('3D Scatter Plot of Clusters')
ax.legend()
plt.show()
*c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*. Please use the *color* keyword-argument or provide a 2D array with a single row if you intend to specify the same RGB or RGBA value for all points. *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*. Please use the *color* keyword-argument or provide a 2D array with a single row if you intend to specify the same RGB or RGBA value for all points. *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*. Please use the *color* keyword-argument or provide a 2D array with a single row if you intend to specify the same RGB or RGBA value for all points.
# 2x2 grid: each RFM feature against CLV1, coloured by cluster, with the
# per-cluster centroid overlaid in red.
cluster_palette = sb.color_palette('viridis', df_cluster_copy['Cluster_no'].nunique())
fig, axs = plt.subplots(2, 2, figsize=(12, 10))
clv_panels = [
    (axs[0, 0], 'Recency'),
    (axs[0, 1], 'Frequency'),
    (axs[1, 0], 'Monetary'),
]
for ax, x_col in clv_panels:
    sb.scatterplot(data=df_cluster_copy, x=x_col, y='CLV1', hue='Cluster_no', palette=cluster_palette, ax=ax)
    ax.set_title(f'{x_col} vs CLV1')
# Overlay per-cluster centroids.
for cluster in np.unique(df_cluster_copy['Cluster_no']):
    centroid = df_cluster_copy[df_cluster_copy['Cluster_no'] == cluster][['Recency', 'Frequency', 'Monetary', 'CLV1']].mean()
    for ax, x_col in clv_panels:
        ax.plot(centroid[x_col], centroid['CLV1'], marker='o', markersize=8, markeredgecolor='black', label=f'Cluster {cluster}', linestyle='', color='red')
# The fourth panel carries no data.
axs[1, 1].axis('off')
plt.tight_layout()
plt.show()
# Reset the index of agg_df to turn 'CustomerID' into a column.
agg_df = agg_df.reset_index()
# Join the aggregated profile with the cluster assignments.
# Fix: the original merged on 'CLV1', but a float CLV value is not a unique
# key — customers that happened to share a CLV1 were cross-joined (the output
# showed 1474 rows for 1468 customers, with CustomerID 18277 duplicated).
# df_cluster_copy is row-aligned with df_merge, so recover each row's
# CustomerID from df_merge and join on that instead. CLV1 is dropped from the
# cluster side to avoid a duplicated column after the merge.
cluster_with_id = df_cluster_copy.drop(columns=['clusters', 'CLV1']).assign(CustomerID=df_merge['CustomerID'].values)
merged_profile = pd.merge(agg_df[['CustomerID', 'Gender', 'Location', 'Quantity', 'Tenure', 'AOV', 'CLV1', 'Product Category', 'Coupon Status']], cluster_with_id, on='CustomerID', how='left')
merged_profile
merged_profile
| CustomerID | Gender | Location | Quantity | Tenure | AOV | CLV1 | Product Category | Coupon Status | Recency | Frequency | Monetary | Cluster_no | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 12346 | F | New York | 3 | 31 | 87.490870 | 138.284186 | Apparel | Used | 107 | 2 | 174.98174 | 0 |
| 1 | 12347 | M | New York | 342 | 20 | 261.447399 | 413.232155 | Nest-USA | Clicked | 59 | 60 | 15686.84396 | 2 |
| 2 | 12348 | M | California | 209 | 39 | 73.458954 | 116.105962 | Office | Clicked | 73 | 23 | 1689.55594 | 0 |
| 3 | 12350 | M | California | 21 | 25 | 86.319722 | 136.433122 | Apparel | Clicked | 17 | 17 | 1467.43528 | 0 |
| 4 | 12356 | F | Chicago | 56 | 31 | 55.761917 | 88.134810 | Apparel | Clicked | 107 | 36 | 2007.42900 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1469 | 18260 | M | New York | 141 | 43 | 75.413593 | 119.195378 | Apparel | Clicked | 87 | 40 | 3016.54372 | 0 |
| 1470 | 18269 | M | Chicago | 10 | 25 | 20.863437 | 32.975823 | Apparel | Clicked | 194 | 8 | 166.90750 | 0 |
| 1471 | 18277 | F | Chicago | 2 | 47 | 333.800000 | 527.589464 | Nest-USA | Not Used | 69 | 1 | 333.80000 | 0 |
| 1472 | 18277 | F | Chicago | 2 | 47 | 333.800000 | 527.589464 | Nest-USA | Not Used | 223 | 1 | 333.80000 | 0 |
| 1473 | 18283 | F | New Jersey | 155 | 36 | 72.125994 | 113.999145 | Apparel | Clicked | 82 | 102 | 7356.85141 | 2 |
1474 rows × 13 columns
# Number of customers assigned to each cluster.
cluster_counts = merged_profile['Cluster_no'].value_counts()
cluster_counts
0 1233 2 234 1 7 Name: Cluster_no, dtype: int64
# Segment distribution as a percentage of all customers.
cluster_percentages = cluster_counts / len(merged_profile) * 100
cluster_percentages
0 83.649932 2 15.875170 1 0.474898 Name: Cluster_no, dtype: float64
# Profile each cluster: mean for numerical attributes, modal value (most
# frequent category) for categorical attributes.
# Set seed for reproducibility
np.random.seed(3)

def _mode(series):
    # Most frequent category in the series.
    return series.mode()[0]

numeric_cols = ['Quantity', 'Tenure', 'AOV', 'Recency', 'Frequency', 'Monetary', 'CLV1']
numerical_profile = merged_profile.groupby('Cluster_no').agg({col: 'mean' for col in numeric_cols})
categorical_profile = merged_profile.groupby('Cluster_no').agg({
    'Location': _mode,
    'Gender': _mode,
    'Product Category': _mode,
    'Coupon Status': _mode,
})
# Combine both views into one profile table per cluster.
complete_profile = numerical_profile.join(categorical_profile).reset_index()
print(complete_profile)
Cluster_no Quantity Tenure AOV Recency Frequency \
0 0 86.865369 25.845904 96.998251 151.868613 21.458232
1 1 2806.571429 30.285714 125.516719 71.571429 501.285714
2 2 475.606838 26.145299 123.204172 107.709402 98.132479
Monetary CLV1 Location Gender Product Category Coupon Status
0 1971.506761 153.311130 California F Apparel Clicked
1 61713.066070 198.386155 Chicago F Apparel Clicked
2 10828.387101 194.731047 California F Apparel Clicked
# Display the merged per-customer profile table with cluster assignments.
merged_profile
| CustomerID | Gender | Location | Quantity | Tenure | AOV | CLV1 | Product Category | Coupon Status | Recency | Frequency | Monetary | Cluster_no | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 12346 | F | New York | 3 | 31 | 87.490870 | 138.284186 | Apparel | Used | 107 | 2 | 174.98174 | 0 |
| 1 | 12347 | M | New York | 342 | 20 | 261.447399 | 413.232155 | Nest-USA | Clicked | 59 | 60 | 15686.84396 | 2 |
| 2 | 12348 | M | California | 209 | 39 | 73.458954 | 116.105962 | Office | Clicked | 73 | 23 | 1689.55594 | 0 |
| 3 | 12350 | M | California | 21 | 25 | 86.319722 | 136.433122 | Apparel | Clicked | 17 | 17 | 1467.43528 | 0 |
| 4 | 12356 | F | Chicago | 56 | 31 | 55.761917 | 88.134810 | Apparel | Clicked | 107 | 36 | 2007.42900 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1469 | 18260 | M | New York | 141 | 43 | 75.413593 | 119.195378 | Apparel | Clicked | 87 | 40 | 3016.54372 | 0 |
| 1470 | 18269 | M | Chicago | 10 | 25 | 20.863437 | 32.975823 | Apparel | Clicked | 194 | 8 | 166.90750 | 0 |
| 1471 | 18277 | F | Chicago | 2 | 47 | 333.800000 | 527.589464 | Nest-USA | Not Used | 69 | 1 | 333.80000 | 0 |
| 1472 | 18277 | F | Chicago | 2 | 47 | 333.800000 | 527.589464 | Nest-USA | Not Used | 223 | 1 | 333.80000 | 0 |
| 1473 | 18283 | F | New Jersey | 155 | 36 | 72.125994 | 113.999145 | Apparel | Clicked | 82 | 102 | 7356.85141 | 2 |
1474 rows × 13 columns
Customers here are perhaps making fewer purchases than cluster 2. However, their purchase value per visit is as high as that of premium customers. And even though their last interaction isn't the most recent, they have high potential future value.
This cluster is the least engaged with the brand. They shop infrequently, spend the least amount of money, and have not made purchases recently, hinting at a risk of churn. The low AOV indicates that customers in this group are purchasing less expensive items or smaller quantities. Their CLV suggests limited long-term profitability.
This is the most loyal segment with customers that spend a lot and shop most frequently, have recently interacted with the brand, and have a long-standing relationship, thus considered 'Premium'.
# Define a dependent variable
# Map each k-means cluster id to a named customer segment.
def label_clusters(cluster):
    """Return the segment name for a cluster id.

    Cluster 1 -> 'Premium', cluster 2 -> 'Gold', anything else -> 'Silver'.
    """
    segment_by_cluster = {1: 'Premium', 2: 'Gold'}
    return segment_by_cluster.get(cluster, 'Silver')
# Attach the human-readable segment label to every customer row.
merged_profile['Segment_Label'] = merged_profile['Cluster_no'].map(label_clusters)
merged_profile.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 1474 entries, 0 to 1473 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CustomerID 1474 non-null int64 1 Gender 1474 non-null object 2 Location 1474 non-null object 3 Quantity 1474 non-null int64 4 Tenure 1474 non-null int64 5 AOV 1474 non-null float64 6 CLV1 1474 non-null float64 7 Product Category 1474 non-null object 8 Coupon Status 1474 non-null object 9 Recency 1474 non-null int64 10 Frequency 1474 non-null int64 11 Monetary 1474 non-null float64 12 Cluster_no 1474 non-null int32 13 Segment_Label 1474 non-null object dtypes: float64(3), int32(1), int64(5), object(5) memory usage: 167.0+ KB
# Features (purchase behaviour + one-hot demographics) and target (segment).
feature_cols = ['Quantity', 'Gender', 'Location', 'Tenure', 'AOV']
X = pd.get_dummies(merged_profile[feature_cols], columns=['Gender', 'Location'])
y = merged_profile['Segment_Label']
# Standardize the feature matrix.
# NOTE(review): the scaler is fit on the full dataset before the train/test
# split below, which leaks test-set statistics into training — confirm intent.
scaler = StandardScaler()
X = scaler.fit_transform(X)
# Rebalance the classes: SMOTE oversamples the minorities, then Tomek-link
# cleaning removes borderline majority samples.
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek
smote_tomek = SMOTETomek(sampling_strategy='auto', random_state=42, smote=SMOTE(k_neighbors=3))
X_resampled, y_resampled = smote_tomek.fit_resample(X, y)
# Report the class counts after resampling.
unique, counts = np.unique(y_resampled, return_counts=True)
print(dict(zip(unique, counts)))
{'Gold': 1211, 'Premium': 1233, 'Silver': 1211}
# 70/30 train-test split on the resampled data.
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)
# Multinomial logistic regression with the 'lbfgs' solver.
model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
model.fit(X_train, y_train)
# Class probabilities and hard predictions on the test set.
y_score = model.predict_proba(X_test)
y_pred = model.predict(X_test)
# Test-set accuracy.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
Accuracy: 0.89
# Training-set accuracy, for comparison against the test score above.
y_pred_train = model.predict(X_train)
accuracy_train = accuracy_score(y_train, y_pred_train)
print(f'Accuracy: {accuracy_train:.2f}')
Accuracy: 0.90
from sklearn.model_selection import learning_curve
# Learning curve on the original (un-resampled) data with 5-fold CV.
train_sizes, train_scores, validation_scores = learning_curve(
    estimator=LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000),
    X=X, y=y, train_sizes=np.linspace(0.01, 1.0, 50), cv=5,
)
# Average the fold scores at each training-set size.
train_scores_mean = train_scores.mean(axis=1)
validation_scores_mean = validation_scores.mean(axis=1)
# Plot training vs. cross-validation accuracy.
plt.figure()
plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
plt.plot(train_sizes, validation_scores_mean, 'o-', color="g", label="Cross-validation score")
plt.title("Learning Curve")
plt.xlabel("Training Set Size")
plt.ylabel("Accuracy Score")
plt.legend(loc="best")
plt.show()
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import ConfusionMatrixDisplay
# Confusion matrix and per-class metrics for the logistic regression.
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)
ConfusionMatrixDisplay(confusion_matrix=conf_matrix).plot()
classif_report = classification_report(y_test, y_pred)
print("\nClassification Report:")
print(classif_report)
Confusion Matrix:
[[291 11 57]
[ 0 348 0]
[ 56 0 334]]
Classification Report:
precision recall f1-score support
Gold 0.84 0.81 0.82 359
Premium 0.97 1.00 0.98 348
Silver 0.85 0.86 0.86 390
accuracy 0.89 1097
macro avg 0.89 0.89 0.89 1097
weighted avg 0.89 0.89 0.89 1097
# Decision tree baseline on the same split.
dt = DecisionTreeClassifier().fit(X_train, y_train)
# Predict on the held-out test set.
y_pred_dt = dt.predict(X_test)
# Test-set accuracy.
print("Accuracy:", metrics.accuracy_score(y_test, y_pred_dt))
Accuracy: 0.9197812215132178
from sklearn.ensemble import RandomForestClassifier
# Random forest with 100 trees and a fixed seed.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
# Test-set evaluation: accuracy, confusion matrix, per-class report.
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classif_report = classification_report(y_test, y_pred)
print(f"Random Forest Model Accuracy: {accuracy:.2f}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(classif_report)
Random Forest Model Accuracy: 0.95
Confusion Matrix:
[[354 0 5]
[ 2 346 0]
[ 51 0 339]]
Classification Report:
precision recall f1-score support
Gold 0.87 0.99 0.92 359
Premium 1.00 0.99 1.00 348
Silver 0.99 0.87 0.92 390
accuracy 0.95 1097
macro avg 0.95 0.95 0.95 1097
weighted avg 0.95 0.95 0.95 1097
# XGBoost requires integer class labels, so encode the string segments first.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)
# Fit on the encoded labels.
xgb_classifier = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
xgb_classifier.fit(X_train, y_train_encoded)
# Predict and map the integer predictions back to segment names.
y_pred = label_encoder.inverse_transform(xgb_classifier.predict(X_test))
# Test-set evaluation: accuracy, confusion matrix, per-class report.
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classif_report = classification_report(y_test, y_pred)
print(f"XGBoost Model Accuracy: {accuracy:.2f}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(classif_report)
XGBoost Model Accuracy: 0.94
Confusion Matrix:
[[344 2 13]
[ 2 346 0]
[ 49 0 341]]
Classification Report:
precision recall f1-score support
Gold 0.87 0.96 0.91 359
Premium 0.99 0.99 0.99 348
Silver 0.96 0.87 0.92 390
accuracy 0.94 1097
macro avg 0.94 0.94 0.94 1097
weighted avg 0.94 0.94 0.94 1097
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
# Basket: one row per transaction, one boolean column per product category
# (True when the category appears in the transaction).
basket = df.groupby(['Transaction_ID', 'Product_Category'])['Quantity'].sum().unstack().fillna(0) > 0
# Frequent itemsets with at least 3% support.
frequent_itemsets = apriori(basket, min_support=0.03, use_colnames=True)
# Association rules, filtered on lift.
association_rules_df = association_rules(frequent_itemsets, metric='lift', min_threshold=0.5)
# Frequently co-purchased itemsets as plain lists.
frequent_itemsets['itemsets'].apply(list)
0 [Apparel] 1 [Bags] 2 [Drinkware] 3 [Lifestyle] 4 [Nest] 5 [Nest-USA] 6 [Office] 7 [Drinkware, Apparel] 8 [Lifestyle, Apparel] 9 [Apparel, Office] 10 [Drinkware, Office] 11 [Lifestyle, Office] Name: itemsets, dtype: object
# Display the mined rules with their support / confidence / lift metrics.
association_rules_df
| antecedents | consequents | antecedent support | consequent support | support | confidence | lift | leverage | conviction | zhangs_metric | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | (Drinkware) | (Apparel) | 0.100714 | 0.324369 | 0.045010 | 0.446910 | 1.377784 | 0.012342 | 1.221557 | 0.304905 |
| 1 | (Apparel) | (Drinkware) | 0.324369 | 0.100714 | 0.045010 | 0.138762 | 1.377784 | 0.012342 | 1.044179 | 0.405838 |
| 2 | (Lifestyle) | (Apparel) | 0.068313 | 0.324369 | 0.033079 | 0.484229 | 1.492836 | 0.010921 | 1.309945 | 0.354340 |
| 3 | (Apparel) | (Lifestyle) | 0.324369 | 0.068313 | 0.033079 | 0.101981 | 1.492836 | 0.010921 | 1.037491 | 0.488630 |
| 4 | (Apparel) | (Office) | 0.324369 | 0.140697 | 0.062128 | 0.191536 | 1.361343 | 0.016491 | 1.062884 | 0.392864 |
| 5 | (Office) | (Apparel) | 0.140697 | 0.324369 | 0.062128 | 0.441577 | 1.361343 | 0.016491 | 1.209892 | 0.308891 |
| 6 | (Drinkware) | (Office) | 0.100714 | 0.140697 | 0.046287 | 0.459588 | 3.266516 | 0.032117 | 1.590089 | 0.771572 |
| 7 | (Office) | (Drinkware) | 0.140697 | 0.100714 | 0.046287 | 0.328985 | 3.266516 | 0.032117 | 1.340187 | 0.807472 |
| 8 | (Lifestyle) | (Office) | 0.068313 | 0.140697 | 0.035114 | 0.514019 | 3.653381 | 0.025503 | 1.768182 | 0.779533 |
| 9 | (Office) | (Lifestyle) | 0.140697 | 0.068313 | 0.035114 | 0.249575 | 3.653381 | 0.025503 | 1.241545 | 0.845197 |
# Map each product category to the array of transaction IDs containing it.
product_transaction_mapping = df.reset_index().groupby('Product_Category')['Transaction_ID'].unique()

# Find the transactions that contain every item of an itemset.
def find_transactions_for_itemset(itemset):
    """Return the transaction IDs containing all items of `itemset`.

    Returns an empty list for an empty itemset or when any item is unknown.
    Fix: the original indexed the first item without the membership check it
    applied to the rest, so an unknown first item raised KeyError instead of
    returning [].
    """
    itemset = list(itemset)
    if not itemset:
        return []
    possible_transactions = None
    for item in itemset:
        if item not in product_transaction_mapping:
            return []
        item_txns = set(product_transaction_mapping[item])
        if possible_transactions is None:
            possible_transactions = item_txns
        else:
            possible_transactions &= item_txns
    return list(possible_transactions)
# For every rule, attach the transactions matching its antecedent itemset.
association_rules_df['Transactions'] = association_rules_df['antecedents'].apply(find_transactions_for_itemset)

# Resolve a set of transaction IDs to the unique customers behind them.
def find_customers_for_transactions(transactions):
    matching = df[df['Transaction_ID'].isin(transactions)]
    return matching['CustomerID'].unique()

# Attach the customers behind those transactions to every rule.
association_rules_df['CustomerIDs'] = association_rules_df['Transactions'].apply(find_customers_for_transactions)
# Include full itemset details when merging with customer profiles
def merge_with_profile(customer_ids, itemset, profiles_df=None):
    """Return the profile rows for `customer_ids`, tagged with `itemset`.

    `itemset` is joined into one comma-separated string stored in a new
    'Itemset' column. `profiles_df` defaults to the module-level
    `merged_profile` DataFrame, so existing callers are unaffected.
    """
    if profiles_df is None:
        profiles_df = merged_profile
    # BUG FIX: the original assigned a column on a boolean-indexed *slice* of
    # merged_profile, which triggers SettingWithCopyWarning (suppressed at the
    # top of this notebook) and may or may not write through to the parent
    # frame. Work on an explicit copy instead.
    profiles = profiles_df[profiles_df['CustomerID'].isin(customer_ids)].copy()
    profiles['Itemset'] = ', '.join(itemset)  # scalar broadcasts to every row
    return profiles
# Usage example to get profiles for a specific rule, using both antecedents and consequents
for _, rule in association_rules_df.iterrows():
    # Full rule itemset = antecedent categories followed by consequent ones.
    itemset_example = [*rule['antecedents'], *rule['consequents']]
    profiles_linked_to_rule = merge_with_profile(rule['CustomerIDs'], itemset_example)
    print(f"Profiles linked to itemset {itemset_example}:")
    print(profiles_linked_to_rule)
Profiles linked to itemset ['Drinkware', 'Apparel']:
CustomerID Gender Location Quantity Tenure AOV \
1 12347 M New York 342 20 261.447399
2 12348 M California 209 39 73.458954
4 12356 F Chicago 56 31 55.761917
6 12370 F New York 613 21 88.515242
7 12373 F New York 22 23 42.540433
... ... ... ... ... ... ...
1464 18239 M New York 140 20 133.231426
1465 18245 F Washington DC 86 5 153.096153
1468 18259 F California 46 5 122.955143
1469 18260 M New York 141 43 75.413593
1473 18283 F New Jersey 155 36 72.125994
CLV1 Product Category Coupon Status Recency Frequency \
1 413.232155 Nest-USA Clicked 59 60
2 116.105962 Office Clicked 73 23
4 88.134810 Apparel Clicked 107 36
6 139.903263 Apparel Clicked 189 91
7 67.237520 Apparel Clicked 24 14
... ... ... ... ... ...
1464 210.579679 Apparel Clicked 34 52
1465 241.976985 Nest-USA Clicked 33 55
1468 194.337441 Apparel Clicked 270 7
1469 119.195378 Apparel Clicked 87 40
1473 113.999145 Apparel Clicked 82 102
Monetary Cluster_no Segment_Label Itemset
1 15686.84396 2 Gold Drinkware, Apparel
2 1689.55594 0 Silver Drinkware, Apparel
4 2007.42900 0 Silver Drinkware, Apparel
6 8054.88702 2 Gold Drinkware, Apparel
7 595.56606 0 Silver Drinkware, Apparel
... ... ... ... ...
1464 6928.03416 2 Gold Drinkware, Apparel
1465 8420.28844 2 Gold Drinkware, Apparel
1468 860.68600 0 Silver Drinkware, Apparel
1469 3016.54372 0 Silver Drinkware, Apparel
1473 7356.85141 2 Gold Drinkware, Apparel
[1031 rows x 15 columns]
Profiles linked to itemset ['Apparel', 'Drinkware']:
CustomerID Gender Location Quantity Tenure AOV CLV1 \
0 12346 F New York 3 31 87.490870 138.284186
1 12347 M New York 342 20 261.447399 413.232155
2 12348 M California 209 39 73.458954 116.105962
3 12350 M California 21 25 86.319722 136.433122
4 12356 F Chicago 56 31 55.761917 88.134810
... ... ... ... ... ... ... ...
1467 18256 M California 4 45 20.838400 32.936250
1468 18259 F California 46 5 122.955143 194.337441
1469 18260 M New York 141 43 75.413593 119.195378
1470 18269 M Chicago 10 25 20.863437 32.975823
1473 18283 F New Jersey 155 36 72.125994 113.999145
Product Category Coupon Status Recency Frequency Monetary \
0 Apparel Used 107 2 174.98174
1 Nest-USA Clicked 59 60 15686.84396
2 Office Clicked 73 23 1689.55594
3 Apparel Clicked 17 17 1467.43528
4 Apparel Clicked 107 36 2007.42900
... ... ... ... ... ...
1467 Apparel Clicked 179 4 83.35360
1468 Apparel Clicked 270 7 860.68600
1469 Apparel Clicked 87 40 3016.54372
1470 Apparel Clicked 194 8 166.90750
1473 Apparel Clicked 82 102 7356.85141
Cluster_no Segment_Label Itemset
0 0 Silver Apparel, Drinkware
1 2 Gold Apparel, Drinkware
2 0 Silver Apparel, Drinkware
3 0 Silver Apparel, Drinkware
4 0 Silver Apparel, Drinkware
... ... ... ...
1467 0 Silver Apparel, Drinkware
1468 0 Silver Apparel, Drinkware
1469 0 Silver Apparel, Drinkware
1470 0 Silver Apparel, Drinkware
1473 2 Gold Apparel, Drinkware
[1367 rows x 15 columns]
Profiles linked to itemset ['Lifestyle', 'Apparel']:
CustomerID Gender Location Quantity Tenure AOV CLV1 \
1 12347 M New York 342 20 261.447399 413.232155
2 12348 M California 209 39 73.458954 116.105962
4 12356 F Chicago 56 31 55.761917 88.134810
6 12370 F New York 613 21 88.515242 139.903263
8 12377 F California 421 27 146.037671 230.820661
... ... ... ... ... ... ... ...
1463 18233 F New Jersey 50 2 82.615000 130.577602
1464 18239 M New York 140 20 133.231426 210.579679
1468 18259 F California 46 5 122.955143 194.337441
1469 18260 M New York 141 43 75.413593 119.195378
1473 18283 F New Jersey 155 36 72.125994 113.999145
Product Category Coupon Status Recency Frequency Monetary \
1 Nest-USA Clicked 59 60 15686.84396
2 Office Clicked 73 23 1689.55594
4 Apparel Clicked 107 36 2007.42900
6 Apparel Clicked 189 91 8054.88702
8 Apparel Clicked 40 77 11244.90066
... ... ... ... ... ...
1463 Office Used 108 1 82.61500
1464 Apparel Clicked 34 52 6928.03416
1468 Apparel Clicked 270 7 860.68600
1469 Apparel Clicked 87 40 3016.54372
1473 Apparel Clicked 82 102 7356.85141
Cluster_no Segment_Label Itemset
1 2 Gold Lifestyle, Apparel
2 0 Silver Lifestyle, Apparel
4 0 Silver Lifestyle, Apparel
6 2 Gold Lifestyle, Apparel
8 2 Gold Lifestyle, Apparel
... ... ... ...
1463 0 Silver Lifestyle, Apparel
1464 2 Gold Lifestyle, Apparel
1468 0 Silver Lifestyle, Apparel
1469 0 Silver Lifestyle, Apparel
1473 2 Gold Lifestyle, Apparel
[932 rows x 15 columns]
Profiles linked to itemset ['Apparel', 'Lifestyle']:
CustomerID Gender Location Quantity Tenure AOV CLV1 \
0 12346 F New York 3 31 87.490870 138.284186
1 12347 M New York 342 20 261.447399 413.232155
2 12348 M California 209 39 73.458954 116.105962
3 12350 M California 21 25 86.319722 136.433122
4 12356 F Chicago 56 31 55.761917 88.134810
... ... ... ... ... ... ... ...
1467 18256 M California 4 45 20.838400 32.936250
1468 18259 F California 46 5 122.955143 194.337441
1469 18260 M New York 141 43 75.413593 119.195378
1470 18269 M Chicago 10 25 20.863437 32.975823
1473 18283 F New Jersey 155 36 72.125994 113.999145
Product Category Coupon Status Recency Frequency Monetary \
0 Apparel Used 107 2 174.98174
1 Nest-USA Clicked 59 60 15686.84396
2 Office Clicked 73 23 1689.55594
3 Apparel Clicked 17 17 1467.43528
4 Apparel Clicked 107 36 2007.42900
... ... ... ... ... ...
1467 Apparel Clicked 179 4 83.35360
1468 Apparel Clicked 270 7 860.68600
1469 Apparel Clicked 87 40 3016.54372
1470 Apparel Clicked 194 8 166.90750
1473 Apparel Clicked 82 102 7356.85141
Cluster_no Segment_Label Itemset
0 0 Silver Apparel, Lifestyle
1 2 Gold Apparel, Lifestyle
2 0 Silver Apparel, Lifestyle
3 0 Silver Apparel, Lifestyle
4 0 Silver Apparel, Lifestyle
... ... ... ...
1467 0 Silver Apparel, Lifestyle
1468 0 Silver Apparel, Lifestyle
1469 0 Silver Apparel, Lifestyle
1470 0 Silver Apparel, Lifestyle
1473 2 Gold Apparel, Lifestyle
[1367 rows x 15 columns]
Profiles linked to itemset ['Apparel', 'Office']:
CustomerID Gender Location Quantity Tenure AOV CLV1 \
0 12346 F New York 3 31 87.490870 138.284186
1 12347 M New York 342 20 261.447399 413.232155
2 12348 M California 209 39 73.458954 116.105962
3 12350 M California 21 25 86.319722 136.433122
4 12356 F Chicago 56 31 55.761917 88.134810
... ... ... ... ... ... ... ...
1467 18256 M California 4 45 20.838400 32.936250
1468 18259 F California 46 5 122.955143 194.337441
1469 18260 M New York 141 43 75.413593 119.195378
1470 18269 M Chicago 10 25 20.863437 32.975823
1473 18283 F New Jersey 155 36 72.125994 113.999145
Product Category Coupon Status Recency Frequency Monetary \
0 Apparel Used 107 2 174.98174
1 Nest-USA Clicked 59 60 15686.84396
2 Office Clicked 73 23 1689.55594
3 Apparel Clicked 17 17 1467.43528
4 Apparel Clicked 107 36 2007.42900
... ... ... ... ... ...
1467 Apparel Clicked 179 4 83.35360
1468 Apparel Clicked 270 7 860.68600
1469 Apparel Clicked 87 40 3016.54372
1470 Apparel Clicked 194 8 166.90750
1473 Apparel Clicked 82 102 7356.85141
Cluster_no Segment_Label Itemset
0 0 Silver Apparel, Office
1 2 Gold Apparel, Office
2 0 Silver Apparel, Office
3 0 Silver Apparel, Office
4 0 Silver Apparel, Office
... ... ... ...
1467 0 Silver Apparel, Office
1468 0 Silver Apparel, Office
1469 0 Silver Apparel, Office
1470 0 Silver Apparel, Office
1473 2 Gold Apparel, Office
[1367 rows x 15 columns]
Profiles linked to itemset ['Office', 'Apparel']:
CustomerID Gender Location Quantity Tenure AOV CLV1 \
0 12346 F New York 3 31 87.490870 138.284186
1 12347 M New York 342 20 261.447399 413.232155
2 12348 M California 209 39 73.458954 116.105962
4 12356 F Chicago 56 31 55.761917 88.134810
5 12359 M New York 86 41 49.002075 77.450505
... ... ... ... ... ... ... ...
1469 18260 M New York 141 43 75.413593 119.195378
1470 18269 M Chicago 10 25 20.863437 32.975823
1471 18277 F Chicago 2 47 333.800000 527.589464
1472 18277 F Chicago 2 47 333.800000 527.589464
1473 18283 F New Jersey 155 36 72.125994 113.999145
Product Category Coupon Status Recency Frequency Monetary \
0 Apparel Used 107 2 174.98174
1 Nest-USA Clicked 59 60 15686.84396
2 Office Clicked 73 23 1689.55594
4 Apparel Clicked 107 36 2007.42900
5 Apparel Clicked 130 16 784.03320
... ... ... ... ... ...
1469 Apparel Clicked 87 40 3016.54372
1470 Apparel Clicked 194 8 166.90750
1471 Nest-USA Not Used 69 1 333.80000
1472 Nest-USA Not Used 223 1 333.80000
1473 Apparel Clicked 82 102 7356.85141
Cluster_no Segment_Label Itemset
0 0 Silver Office, Apparel
1 2 Gold Office, Apparel
2 0 Silver Office, Apparel
4 0 Silver Office, Apparel
5 0 Silver Office, Apparel
... ... ... ...
1469 0 Silver Office, Apparel
1470 0 Silver Office, Apparel
1471 0 Silver Office, Apparel
1472 0 Silver Office, Apparel
1473 2 Gold Office, Apparel
[1203 rows x 15 columns]
Profiles linked to itemset ['Drinkware', 'Office']:
CustomerID Gender Location Quantity Tenure AOV \
1 12347 M New York 342 20 261.447399
2 12348 M California 209 39 73.458954
4 12356 F Chicago 56 31 55.761917
6 12370 F New York 613 21 88.515242
7 12373 F New York 22 23 42.540433
... ... ... ... ... ... ...
1464 18239 M New York 140 20 133.231426
1465 18245 F Washington DC 86 5 153.096153
1468 18259 F California 46 5 122.955143
1469 18260 M New York 141 43 75.413593
1473 18283 F New Jersey 155 36 72.125994
CLV1 Product Category Coupon Status Recency Frequency \
1 413.232155 Nest-USA Clicked 59 60
2 116.105962 Office Clicked 73 23
4 88.134810 Apparel Clicked 107 36
6 139.903263 Apparel Clicked 189 91
7 67.237520 Apparel Clicked 24 14
... ... ... ... ... ...
1464 210.579679 Apparel Clicked 34 52
1465 241.976985 Nest-USA Clicked 33 55
1468 194.337441 Apparel Clicked 270 7
1469 119.195378 Apparel Clicked 87 40
1473 113.999145 Apparel Clicked 82 102
Monetary Cluster_no Segment_Label Itemset
1 15686.84396 2 Gold Drinkware, Office
2 1689.55594 0 Silver Drinkware, Office
4 2007.42900 0 Silver Drinkware, Office
6 8054.88702 2 Gold Drinkware, Office
7 595.56606 0 Silver Drinkware, Office
... ... ... ... ...
1464 6928.03416 2 Gold Drinkware, Office
1465 8420.28844 2 Gold Drinkware, Office
1468 860.68600 0 Silver Drinkware, Office
1469 3016.54372 0 Silver Drinkware, Office
1473 7356.85141 2 Gold Drinkware, Office
[1031 rows x 15 columns]
Profiles linked to itemset ['Office', 'Drinkware']:
CustomerID Gender Location Quantity Tenure AOV CLV1 \
0 12346 F New York 3 31 87.490870 138.284186
1 12347 M New York 342 20 261.447399 413.232155
2 12348 M California 209 39 73.458954 116.105962
4 12356 F Chicago 56 31 55.761917 88.134810
5 12359 M New York 86 41 49.002075 77.450505
... ... ... ... ... ... ... ...
1469 18260 M New York 141 43 75.413593 119.195378
1470 18269 M Chicago 10 25 20.863437 32.975823
1471 18277 F Chicago 2 47 333.800000 527.589464
1472 18277 F Chicago 2 47 333.800000 527.589464
1473 18283 F New Jersey 155 36 72.125994 113.999145
Product Category Coupon Status Recency Frequency Monetary \
0 Apparel Used 107 2 174.98174
1 Nest-USA Clicked 59 60 15686.84396
2 Office Clicked 73 23 1689.55594
4 Apparel Clicked 107 36 2007.42900
5 Apparel Clicked 130 16 784.03320
... ... ... ... ... ...
1469 Apparel Clicked 87 40 3016.54372
1470 Apparel Clicked 194 8 166.90750
1471 Nest-USA Not Used 69 1 333.80000
1472 Nest-USA Not Used 223 1 333.80000
1473 Apparel Clicked 82 102 7356.85141
Cluster_no Segment_Label Itemset
0 0 Silver Office, Drinkware
1 2 Gold Office, Drinkware
2 0 Silver Office, Drinkware
4 0 Silver Office, Drinkware
5 0 Silver Office, Drinkware
... ... ... ...
1469 0 Silver Office, Drinkware
1470 0 Silver Office, Drinkware
1471 0 Silver Office, Drinkware
1472 0 Silver Office, Drinkware
1473 2 Gold Office, Drinkware
[1203 rows x 15 columns]
Profiles linked to itemset ['Lifestyle', 'Office']:
CustomerID Gender Location Quantity Tenure AOV CLV1 \
1 12347 M New York 342 20 261.447399 413.232155
2 12348 M California 209 39 73.458954 116.105962
4 12356 F Chicago 56 31 55.761917 88.134810
6 12370 F New York 613 21 88.515242 139.903263
8 12377 F California 421 27 146.037671 230.820661
... ... ... ... ... ... ... ...
1463 18233 F New Jersey 50 2 82.615000 130.577602
1464 18239 M New York 140 20 133.231426 210.579679
1468 18259 F California 46 5 122.955143 194.337441
1469 18260 M New York 141 43 75.413593 119.195378
1473 18283 F New Jersey 155 36 72.125994 113.999145
Product Category Coupon Status Recency Frequency Monetary \
1 Nest-USA Clicked 59 60 15686.84396
2 Office Clicked 73 23 1689.55594
4 Apparel Clicked 107 36 2007.42900
6 Apparel Clicked 189 91 8054.88702
8 Apparel Clicked 40 77 11244.90066
... ... ... ... ... ...
1463 Office Used 108 1 82.61500
1464 Apparel Clicked 34 52 6928.03416
1468 Apparel Clicked 270 7 860.68600
1469 Apparel Clicked 87 40 3016.54372
1473 Apparel Clicked 82 102 7356.85141
Cluster_no Segment_Label Itemset
1 2 Gold Lifestyle, Office
2 0 Silver Lifestyle, Office
4 0 Silver Lifestyle, Office
6 2 Gold Lifestyle, Office
8 2 Gold Lifestyle, Office
... ... ... ...
1463 0 Silver Lifestyle, Office
1464 2 Gold Lifestyle, Office
1468 0 Silver Lifestyle, Office
1469 0 Silver Lifestyle, Office
1473 2 Gold Lifestyle, Office
[932 rows x 15 columns]
Profiles linked to itemset ['Office', 'Lifestyle']:
CustomerID Gender Location Quantity Tenure AOV CLV1 \
0 12346 F New York 3 31 87.490870 138.284186
1 12347 M New York 342 20 261.447399 413.232155
2 12348 M California 209 39 73.458954 116.105962
4 12356 F Chicago 56 31 55.761917 88.134810
5 12359 M New York 86 41 49.002075 77.450505
... ... ... ... ... ... ... ...
1469 18260 M New York 141 43 75.413593 119.195378
1470 18269 M Chicago 10 25 20.863437 32.975823
1471 18277 F Chicago 2 47 333.800000 527.589464
1472 18277 F Chicago 2 47 333.800000 527.589464
1473 18283 F New Jersey 155 36 72.125994 113.999145
Product Category Coupon Status Recency Frequency Monetary \
0 Apparel Used 107 2 174.98174
1 Nest-USA Clicked 59 60 15686.84396
2 Office Clicked 73 23 1689.55594
4 Apparel Clicked 107 36 2007.42900
5 Apparel Clicked 130 16 784.03320
... ... ... ... ... ...
1469 Apparel Clicked 87 40 3016.54372
1470 Apparel Clicked 194 8 166.90750
1471 Nest-USA Not Used 69 1 333.80000
1472 Nest-USA Not Used 223 1 333.80000
1473 Apparel Clicked 82 102 7356.85141
Cluster_no Segment_Label Itemset
0 0 Silver Office, Lifestyle
1 2 Gold Office, Lifestyle
2 0 Silver Office, Lifestyle
4 0 Silver Office, Lifestyle
5 0 Silver Office, Lifestyle
... ... ... ...
1469 0 Silver Office, Lifestyle
1470 0 Silver Office, Lifestyle
1471 0 Silver Office, Lifestyle
1472 0 Silver Office, Lifestyle
1473 2 Gold Office, Lifestyle
[1203 rows x 15 columns]
from datetime import timedelta
# Function to find transactions for complex itemsets within the same day for the same customer
def find_transactions_for_itemset(itemset, data=None, mapping=None):
    """Return Transaction_IDs for `itemset`, expanded to every transaction the
    same customer made on the same calendar day.

    Parameters
    ----------
    itemset : iterable of str
        Product categories to match (all must be present).
    data : pandas.DataFrame, optional
        Transaction rows with 'Transaction_ID', 'CustomerID' and
        'Transaction_Date' columns. Defaults to the module-level `df`.
    mapping : dict-like, optional
        Category -> collection of Transaction_IDs. Defaults to the
        module-level `product_transaction_mapping`.

    Returns
    -------
    list
        Unique Transaction_IDs; [] when the itemset is empty or any
        category is unknown.
    """
    if data is None:
        data = df
    if mapping is None:
        mapping = product_transaction_mapping
    itemset = list(itemset)
    if not itemset:
        return []
    # BUG FIX: the original indexed mapping[itemset[0]] unguarded (KeyError on
    # an unknown first item) while later items returned [].
    if itemset[0] not in mapping:
        return []
    possible_transactions = set(mapping[itemset[0]])
    for item in itemset[1:]:
        if item not in mapping:
            return []  # No transactions if any item is missing
        possible_transactions.intersection_update(mapping[item])
    # Expand to same-customer / same-day transactions.
    # PERF FIX: the original scanned the whole DataFrame twice per candidate
    # transaction (O(candidates * rows)); a single merge on the distinct
    # (CustomerID, Transaction_Date) pairs does the same expansion in one pass.
    # NOTE(review): assumes each Transaction_ID maps to one (customer, date)
    # pair, as the original's .iloc[0] lookups also assumed — confirm.
    candidates = data.loc[data['Transaction_ID'].isin(possible_transactions),
                          ['CustomerID', 'Transaction_Date']].drop_duplicates()
    expanded = data.merge(candidates, on=['CustomerID', 'Transaction_Date'])
    return expanded['Transaction_ID'].unique().tolist()
# Calculate the transactions associated with each antecedent itemset in the DataFrame and stores this information in a new column called 'Transactions'
# (now using the same-day/same-customer expansion defined above)
association_rules_df['Transactions'] = association_rules_df['antecedents'].apply(find_transactions_for_itemset)
# Find CustomerIDs for these transactions
def find_customers_for_transactions(transactions):
    # Unique customers whose rows carry any of the given Transaction_IDs.
    # NOTE(review): redefines the identically-named helper from the earlier cell.
    return df[df['Transaction_ID'].isin(transactions)]['CustomerID'].unique()
association_rules_df['CustomerIDs'] = association_rules_df['Transactions'].apply(find_customers_for_transactions)
# Include full itemset details when merging with customer profiles
def merge_with_profile(customer_ids, itemset, profiles_df=None):
    """Return the profile rows for `customer_ids`, tagged with `itemset`
    (joined into a comma-separated string in a new 'Itemset' column).

    `profiles_df` defaults to the module-level `merged_profile`, so
    existing callers are unaffected.
    """
    if profiles_df is None:
        profiles_df = merged_profile
    # BUG FIX: assign on an explicit copy; the original wrote a column onto a
    # boolean-indexed slice of merged_profile (SettingWithCopyWarning,
    # suppressed at the top of this notebook).
    profiles = profiles_df[profiles_df['CustomerID'].isin(customer_ids)].copy()
    profiles['Itemset'] = ', '.join(itemset)  # scalar broadcasts to every row
    return profiles
def display_segment_itemset_associations():
    """Print, for each customer segment, how many profiled customers are linked
    to each association-rule itemset (antecedents + consequents), sorted by
    segment and then by descending count.

    Reads the module-level `association_rules_df` and uses
    `merge_with_profile`; purely a display helper (returns None).
    """
    # PERF FIX: collect the per-rule frames and concatenate once at the end;
    # the original re-concatenated the growing result inside the loop, which
    # copies all accumulated rows on every iteration (quadratic).
    frames = []
    for _, rule in association_rules_df.iterrows():
        # Full itemset = antecedent categories followed by consequent ones.
        itemset = list(rule['antecedents']) + list(rule['consequents'])
        profiles = merge_with_profile(rule['CustomerIDs'], itemset)
        frames.append(profiles.groupby(['Segment_Label', 'Itemset']).size().reset_index(name='Count'))
    if frames:
        results = pd.concat(frames, ignore_index=True)
    else:
        # Guard: no rules -> empty table with the expected columns.
        results = pd.DataFrame(columns=['Segment_Label', 'Itemset', 'Count'])
    # Sorting and displaying the results
    results = results.sort_values(by=['Segment_Label', 'Count'], ascending=[True, False])
    print(results)
# Example usage
display_segment_itemset_associations()
Segment_Label Itemset Count 3 Gold Apparel, Drinkware 234 9 Gold Apparel, Lifestyle 234 12 Gold Apparel, Office 234 0 Gold Drinkware, Apparel 233 18 Gold Drinkware, Office 233 15 Gold Office, Apparel 230 21 Gold Office, Drinkware 230 27 Gold Office, Lifestyle 230 6 Gold Lifestyle, Apparel 226 24 Gold Lifestyle, Office 226 1 Premium Drinkware, Apparel 7 4 Premium Apparel, Drinkware 7 7 Premium Lifestyle, Apparel 7 10 Premium Apparel, Lifestyle 7 13 Premium Apparel, Office 7 16 Premium Office, Apparel 7 19 Premium Drinkware, Office 7 22 Premium Office, Drinkware 7 25 Premium Lifestyle, Office 7 28 Premium Office, Lifestyle 7 5 Silver Apparel, Drinkware 1162 11 Silver Apparel, Lifestyle 1162 14 Silver Apparel, Office 1162 17 Silver Office, Apparel 1066 23 Silver Office, Drinkware 1066 29 Silver Office, Lifestyle 1066 2 Silver Drinkware, Apparel 964 20 Silver Drinkware, Office 964 8 Silver Lifestyle, Apparel 866 26 Silver Lifestyle, Office 866
from datetime import date
# Normalize transaction timestamps to calendar dates (datetime.date objects).
df['Transaction_Date'] = pd.to_datetime(df['Transaction_Date']).dt.date
# Observation split: first 9 months build features, last ~3 months provide the label.
df_first9m = df[(df.Transaction_Date >= date(2019,1,1)) & (df.Transaction_Date < date(2019,9,1))].reset_index(drop=True)
# BUG FIX: the original upper bound was `< date(2019,12,31)`, which silently
# dropped every transaction made on 31 December; use an exclusive bound at the
# start of the next year so the full last quarter is included.
df_last3m = df[(df.Transaction_Date >= date(2019,9,1)) & (df.Transaction_Date < date(2020,1,1))].reset_index(drop=True)
# Extract unique customer ID
df_first9m_unique = pd.DataFrame(df_first9m['CustomerID'].unique())
df_first9m_unique.columns = ['CustomerID']
# Last purchase in the first 9 months
df_first9m_last_purchase = df_first9m.groupby('CustomerID').Transaction_Date.max().reset_index()
df_first9m_last_purchase.columns = ['CustomerID', 'LastPurchaseDate']
# First purchase from September
df_last3m_first_purchase = df_last3m.groupby('CustomerID').Transaction_Date.min().reset_index()
df_last3m_first_purchase.columns = ['CustomerID', 'FirstPurchaseDate']
# Merge (left join keeps customers with no last-quarter purchase: NaN dates)
df_purchase_dates = pd.merge(df_first9m_last_purchase, df_last3m_first_purchase, on = 'CustomerID', how = 'left')
# Calculate next purchase date: days between the last first-9-months purchase
# and the first last-quarter purchase.
# NOTE(review): the columns hold datetime.date objects; `.dt.days` assumes the
# subtraction produces a timedelta series — confirm dtype at runtime.
df_purchase_dates['NextPurchaseDate'] = (df_purchase_dates['FirstPurchaseDate'] - df_purchase_dates['LastPurchaseDate']).dt.days
df_purchase_dates
| CustomerID | LastPurchaseDate | FirstPurchaseDate | NextPurchaseDate | |
|---|---|---|---|---|
| 0 | 12347 | 2019-03-24 | 2019-11-01 | 222.0 |
| 1 | 12348 | 2019-06-22 | 2019-10-19 | 119.0 |
| 2 | 12359 | 2019-08-23 | NaN | NaN |
| 3 | 12370 | 2019-06-25 | NaN | NaN |
| 4 | 12377 | 2019-07-05 | 2019-11-21 | 139.0 |
| ... | ... | ... | ... | ... |
| 1124 | 18256 | 2019-07-05 | NaN | NaN |
| 1125 | 18259 | 2019-04-05 | NaN | NaN |
| 1126 | 18260 | 2019-08-11 | 2019-10-05 | 55.0 |
| 1127 | 18269 | 2019-06-20 | NaN | NaN |
| 1128 | 18283 | 2019-07-29 | 2019-10-10 | 73.0 |
1129 rows × 4 columns
# Extract customer ID from data frame of the first 9 months
df_user = pd.DataFrame(df_first9m['CustomerID'].unique())
df_user.columns = ['CustomerID']
# Add in column NextPurchaseDate from df_purchase_dates dataframe
df_user = pd.merge(df_user, df_purchase_dates[['CustomerID', 'NextPurchaseDate']], on = 'CustomerID', how = 'left')
# 999 is a sentinel meaning "no purchase observed in the labelling window"
# (the customer did not return); downstream models see it as a very large gap.
df_user = df_user.fillna(999)
df_user
| CustomerID | NextPurchaseDate | |
|---|---|---|
| 0 | 17850 | 999.0 |
| 1 | 13047 | 139.0 |
| 2 | 12583 | 999.0 |
| 3 | 15100 | 999.0 |
| 4 | 14688 | 255.0 |
| ... | ... | ... |
| 1124 | 12928 | 999.0 |
| 1125 | 15749 | 999.0 |
| 1126 | 14209 | 999.0 |
| 1127 | 12728 | 999.0 |
| 1128 | 15569 | 999.0 |
1129 rows × 2 columns
# Recency: days since each customer's last purchase, measured against the most
# recent purchase date seen anywhere in the first-9-months window (0 = most recent).
df_max_purchase = df_first9m.groupby('CustomerID').Transaction_Date.max().reset_index()
df_max_purchase.columns = ['CustomerID', 'MaxPurchaseDate']
# NOTE(review): MaxPurchaseDate holds datetime.date objects; `.dt.days` assumes
# the subtraction yields a timedelta series — confirm dtype at runtime.
df_max_purchase['Recency'] = (df_max_purchase['MaxPurchaseDate'].max() - df_max_purchase['MaxPurchaseDate']).dt.days
df_user = pd.merge(df_user, df_max_purchase[['CustomerID', 'Recency']], on = 'CustomerID')
df_user.head()
| CustomerID | NextPurchaseDate | Recency | |
|---|---|---|---|
| 0 | 17850 | 999.0 | 217 |
| 1 | 13047 | 139.0 | 30 |
| 2 | 12583 | 999.0 | 29 |
| 3 | 15100 | 999.0 | 1 |
| 4 | 14688 | 255.0 | 241 |
df_user.Recency.describe()
count 1129.000000 mean 91.518158 std 70.067431 min 0.000000 25% 30.000000 50% 78.000000 75% 143.000000 max 242.000000 Name: Recency, dtype: float64
# Elbow curve: SSE (inertia) vs k, to pick the number of Recency clusters.
sse = {}
df_recency = df_user[['Recency']].copy()  # Create a copy of the DataFrame
for k in range(1, 10):
    kmeans = KMeans(n_clusters=k, max_iter=1000).fit(df_recency)
    # BUG FIX: the original stored kmeans.labels_ into df_recency["clusters"]
    # inside this loop, so every fit after k=1 clustered on the two columns
    # [Recency, clusters] instead of Recency alone, distorting the SSE curve.
    # The labels were never used, so the assignment is simply removed.
    sse[k] = kmeans.inertia_
plt.figure()
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Number of Clusters")
plt.ylabel("SSE")
plt.show()
# Final Recency clustering with k = 4 (chosen from the elbow above).
# NOTE(review): no random_state is set, so cluster labels are not reproducible
# across runs — consider KMeans(n_clusters=4, random_state=0).
kmeans = KMeans(n_clusters=4)
kmeans.fit(df_user[['Recency']])
df_user['RecencyScore'] = kmeans.predict(df_user[['Recency']])
def order_cluster(cluster_field_name, target_field_name, df, ascending):
    """Relabel arbitrary k-means cluster ids so they are ordered by the
    per-cluster mean of `target_field_name`.

    Parameters
    ----------
    cluster_field_name : str
        Column holding the raw (arbitrary) cluster labels.
    target_field_name : str
        Numeric column whose per-cluster mean defines the ordering.
    df : pandas.DataFrame
        Frame containing both columns; not modified in place.
    ascending : bool
        True -> label 0 goes to the cluster with the lowest mean.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` (row order may differ after the merge) with
        `cluster_field_name` replaced by the ordered labels 0..k-1.
    """
    # Rank clusters by mean of the target field; each cluster's position in
    # the sorted order becomes its new label.
    # (FIX: removed the original's unused `new_cluster_field_name` local.)
    cluster_means = df.groupby(cluster_field_name)[target_field_name].mean().reset_index()
    cluster_means = cluster_means.sort_values(by=target_field_name, ascending=ascending).reset_index(drop=True)
    cluster_means['index'] = cluster_means.index
    # Map each row's old label to its ordered label, then swap the columns.
    df_final = pd.merge(df, cluster_means[[cluster_field_name, 'index']], on=cluster_field_name)
    df_final = df_final.drop([cluster_field_name], axis=1)
    df_final = df_final.rename(columns={'index': cluster_field_name})
    return df_final
# Relabel RecencyScore so 0 = least recent (highest Recency days) and 3 = most
# recent — i.e. a higher score is better, consistent with the other RFM scores.
df_user = order_cluster('RecencyScore','Recency', df_user, False)
df_user
| CustomerID | NextPurchaseDate | Recency | RecencyScore | |
|---|---|---|---|---|
| 0 | 17850 | 999.0 | 217 | 0 |
| 1 | 14688 | 255.0 | 241 | 0 |
| 2 | 13748 | 999.0 | 242 | 0 |
| 3 | 17548 | 999.0 | 240 | 0 |
| 4 | 13705 | 999.0 | 240 | 0 |
| ... | ... | ... | ... | ... |
| 1124 | 14082 | 999.0 | 128 | 1 |
| 1125 | 16367 | 999.0 | 117 | 1 |
| 1126 | 17917 | 999.0 | 116 | 1 |
| 1127 | 12875 | 999.0 | 117 | 1 |
| 1128 | 16010 | 999.0 | 116 | 1 |
1129 rows × 4 columns
df_user.groupby('RecencyScore')['Recency'].describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| RecencyScore | ||||||||
| 0 | 152.0 | 217.184211 | 15.484845 | 184.0 | 210.0 | 218.0 | 230.0 | 242.0 |
| 1 | 238.0 | 148.096639 | 17.908202 | 116.0 | 134.0 | 146.0 | 163.0 | 182.0 |
| 2 | 297.0 | 83.047138 | 17.070139 | 54.0 | 70.0 | 81.0 | 99.0 | 115.0 |
| 3 | 442.0 | 23.529412 | 15.986113 | 0.0 | 9.0 | 22.0 | 37.0 | 53.0 |
# Frequency: number of transaction rows per customer in the first 9 months
df_frequency = df_first9m.groupby('CustomerID').Transaction_Date.count().reset_index()
df_frequency.columns = ['CustomerID', 'Frequency']
df_frequency.head()
| CustomerID | Frequency | |
|---|---|---|
| 0 | 12347 | 31 |
| 1 | 12348 | 17 |
| 2 | 12359 | 16 |
| 3 | 12370 | 91 |
| 4 | 12377 | 43 |
# Attach Frequency to the per-customer feature table (inner join on CustomerID)
df_user = pd.merge(df_user, df_frequency, on = 'CustomerID')
df_user.head()
| CustomerID | NextPurchaseDate | Recency | RecencyScore | Frequency | |
|---|---|---|---|---|---|
| 0 | 17850 | 999.0 | 217 | 0 | 297 |
| 1 | 14688 | 255.0 | 241 | 0 | 19 |
| 2 | 13748 | 999.0 | 242 | 0 | 1 |
| 3 | 17548 | 999.0 | 240 | 0 | 7 |
| 4 | 13705 | 999.0 | 240 | 0 | 10 |
# Elbow curve: SSE (inertia) vs k, to pick the number of Frequency clusters.
sse = {}
df_frequency = df_user[['Frequency']].copy()  # Create a copy of the DataFrame
for k in range(1, 10):
    kmeans = KMeans(n_clusters=k, max_iter=1000).fit(df_frequency)
    # BUG FIX: the original wrote kmeans.labels_ into df_recency["clusters"] —
    # a copy-paste slip from the Recency elbow cell. The labels are never used
    # for the elbow, so the assignment is removed entirely (redirecting it to
    # df_frequency would pollute the next iteration's fit input instead).
    sse[k] = kmeans.inertia_
plt.figure()
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Number of Clusters")
plt.ylabel("SSE")
plt.show()
# Final Frequency clustering with k = 4 (chosen from the elbow above).
# NOTE(review): no random_state, so labels vary run to run.
kmeans = KMeans(n_clusters=4)
kmeans.fit(df_user[['Frequency']])
df_user['FrequencyScore'] = kmeans.predict(df_user[['Frequency']])
df_user.groupby('FrequencyScore')['Frequency'].describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| FrequencyScore | ||||||||
| 0 | 271.0 | 50.287823 | 13.859019 | 32.0 | 38.00 | 48.0 | 60.00 | 83.0 |
| 1 | 82.0 | 117.329268 | 30.096575 | 84.0 | 92.25 | 112.0 | 136.25 | 210.0 |
| 2 | 769.0 | 13.085826 | 8.599626 | 1.0 | 5.00 | 13.0 | 19.00 | 31.0 |
| 3 | 7.0 | 385.285714 | 141.656526 | 261.0 | 309.50 | 346.0 | 393.50 | 684.0 |
# Relabel FrequencyScore so a higher score = a more frequent buyer (ascending means)
df_user = order_cluster('FrequencyScore','Frequency', df_user, True)
df_user
| CustomerID | NextPurchaseDate | Recency | RecencyScore | Frequency | FrequencyScore | |
|---|---|---|---|---|---|---|
| 0 | 17850 | 999.0 | 217 | 0 | 297 | 3 |
| 1 | 14606 | 18.0 | 7 | 3 | 360 | 3 |
| 2 | 15311 | 9.0 | 7 | 3 | 322 | 3 |
| 3 | 12748 | 41.0 | 36 | 3 | 684 | 3 |
| 4 | 14911 | 34.0 | 19 | 3 | 346 | 3 |
| ... | ... | ... | ... | ... | ... | ... |
| 1124 | 13174 | 254.0 | 180 | 1 | 169 | 2 |
| 1125 | 12433 | 999.0 | 143 | 1 | 147 | 2 |
| 1126 | 17827 | 999.0 | 127 | 1 | 112 | 2 |
| 1127 | 17259 | 999.0 | 173 | 1 | 87 | 2 |
| 1128 | 17341 | 999.0 | 150 | 1 | 161 | 2 |
1129 rows × 6 columns
# Monetary Value: total revenue per customer over the first 9 months
# NOTE(review): assumes an 'Invoice' amount column was added to the transaction
# frame in an earlier cell — confirm its definition there.
df_monetary = df_first9m.groupby('CustomerID').Invoice.sum().reset_index()
df_monetary.columns = ['CustomerID', 'Monetary']
df_monetary.head()
| CustomerID | Monetary | |
|---|---|---|
| 0 | 12347 | 12901.28216 |
| 1 | 12348 | 878.21724 |
| 2 | 12359 | 784.03320 |
| 3 | 12370 | 8054.88702 |
| 4 | 12377 | 4219.86498 |
# Attach Monetary to the per-customer feature table (inner join on CustomerID)
df_user = pd.merge(df_user, df_monetary, on = 'CustomerID')
df_user.head()
| CustomerID | NextPurchaseDate | Recency | RecencyScore | Frequency | FrequencyScore | Monetary | |
|---|---|---|---|---|---|---|---|
| 0 | 17850 | 999.0 | 217 | 0 | 297 | 3 | 40531.38476 |
| 1 | 14606 | 18.0 | 7 | 3 | 360 | 3 | 34610.78703 |
| 2 | 15311 | 9.0 | 7 | 3 | 322 | 3 | 62975.87976 |
| 3 | 12748 | 41.0 | 36 | 3 | 684 | 3 | 86143.00498 |
| 4 | 14911 | 34.0 | 19 | 3 | 346 | 3 | 38682.34453 |
# Elbow curve: SSE (inertia) vs k, to pick the number of Monetary clusters.
sse = {}
df_monetary = df_user[['Monetary']].copy()  # Create a copy of the DataFrame
for k in range(1, 10):
    kmeans = KMeans(n_clusters=k, max_iter=1000).fit(df_monetary)
    # BUG FIX: the original wrote kmeans.labels_ into df_recency["clusters"] —
    # a copy-paste slip from the Recency elbow cell. The labels are never used
    # for the elbow, so the assignment is removed entirely (redirecting it to
    # df_monetary would pollute the next iteration's fit input instead).
    sse[k] = kmeans.inertia_
plt.figure()
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Number of Clusters")
plt.ylabel("SSE")
plt.show()
# Final Monetary clustering with k = 4 (chosen from the elbow above), then
# relabel so a higher MonetaryScore = a higher spender.
# NOTE(review): no random_state, so labels vary run to run.
kmeans = KMeans(n_clusters=4)
kmeans.fit(df_user[['Monetary']])
df_user['MonetaryScore'] = kmeans.predict(df_user[['Monetary']])
df_user = order_cluster('MonetaryScore','Monetary', df_user, True)
df_user
| CustomerID | NextPurchaseDate | Recency | RecencyScore | Frequency | FrequencyScore | Monetary | MonetaryScore | |
|---|---|---|---|---|---|---|---|---|
| 0 | 17850 | 999.0 | 217 | 0 | 297 | 3 | 40531.38476 | 2 |
| 1 | 14606 | 18.0 | 7 | 3 | 360 | 3 | 34610.78703 | 2 |
| 2 | 14911 | 34.0 | 19 | 3 | 346 | 3 | 38682.34453 | 2 |
| 3 | 17841 | 14.0 | 8 | 3 | 427 | 3 | 40768.85344 | 2 |
| 4 | 13081 | 999.0 | 0 | 3 | 261 | 3 | 23607.04965 | 2 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1124 | 15426 | 999.0 | 169 | 1 | 87 | 2 | 7671.12216 | 1 |
| 1125 | 13174 | 254.0 | 180 | 1 | 169 | 2 | 16049.48846 | 1 |
| 1126 | 17827 | 999.0 | 127 | 1 | 112 | 2 | 8747.50404 | 1 |
| 1127 | 17259 | 999.0 | 173 | 1 | 87 | 2 | 6587.70300 | 1 |
| 1128 | 17341 | 999.0 | 150 | 1 | 161 | 2 | 13437.68995 | 1 |
1129 rows × 8 columns
df_user.groupby('MonetaryScore')['Monetary'].describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| MonetaryScore | ||||||||
| 0 | 917.0 | 1497.313441 | 1205.735670 | 7.32000 | 477.240000 | 1167.992100 | 2331.138900 | 4571.40105 |
| 1 | 196.0 | 7786.883740 | 2647.215751 | 4669.39420 | 5663.547540 | 7200.262005 | 8847.932535 | 16049.48846 |
| 2 | 14.0 | 26142.091861 | 8830.008688 | 17733.48544 | 19286.055310 | 22513.490325 | 33251.835132 | 40768.85344 |
| 3 | 2.0 | 74559.442370 | 16381.631344 | 62975.87976 | 68767.661065 | 74559.442370 | 80351.223675 | 86143.00498 |
# Overall segmentation: sum of the three ordered RFM scores (range 0-9,
# higher = better customer), then mean R/F/M per overall score as a sanity check.
df_user['OverallScore']= df_user['RecencyScore'] + df_user['FrequencyScore'] + df_user['MonetaryScore']
df_user.groupby('OverallScore')[['Recency', 'Frequency', 'Monetary']].mean()
| Recency | Frequency | Monetary | |
|---|---|---|---|
| OverallScore | |||
| 0 | 219.421053 | 11.149123 | 1310.507012 |
| 1 | 150.454054 | 13.324324 | 1202.965619 |
| 2 | 101.170370 | 20.200000 | 2003.159898 |
| 3 | 44.126543 | 22.555556 | 1988.157034 |
| 4 | 40.902655 | 50.584071 | 4177.942595 |
| 5 | 47.283333 | 76.733333 | 8013.653154 |
| 6 | 23.222222 | 117.370370 | 10330.967894 |
| 7 | 24.333333 | 151.000000 | 19941.476807 |
| 8 | 8.500000 | 348.500000 | 34417.258662 |
| 9 | 21.500000 | 503.000000 | 74559.442370 |
# Map OverallScore to a named segment: 0-2 Low, 3-5 Mid, 6-9 High.
df_user['Segment'] = 'Low-Value'
mid_mask = df_user['OverallScore'] > 2
high_mask = df_user['OverallScore'] > 5
df_user.loc[mid_mask, 'Segment'] = 'Mid-Value'
# High overwrites Mid because high_mask is a subset of mid_mask.
df_user.loc[high_mask, 'Segment'] = 'High-Value'
df_user
| CustomerID | NextPurchaseDate | Recency | RecencyScore | Frequency | FrequencyScore | Monetary | MonetaryScore | OverallScore | Segment | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 17850 | 999.0 | 217 | 0 | 297 | 3 | 40531.38476 | 2 | 5 | Mid-Value |
| 1 | 14606 | 18.0 | 7 | 3 | 360 | 3 | 34610.78703 | 2 | 8 | High-Value |
| 2 | 14911 | 34.0 | 19 | 3 | 346 | 3 | 38682.34453 | 2 | 8 | High-Value |
| 3 | 17841 | 14.0 | 8 | 3 | 427 | 3 | 40768.85344 | 2 | 8 | High-Value |
| 4 | 13081 | 999.0 | 0 | 3 | 261 | 3 | 23607.04965 | 2 | 8 | High-Value |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1124 | 15426 | 999.0 | 169 | 1 | 87 | 2 | 7671.12216 | 1 | 4 | Mid-Value |
| 1125 | 13174 | 254.0 | 180 | 1 | 169 | 2 | 16049.48846 | 1 | 4 | Mid-Value |
| 1126 | 17827 | 999.0 | 127 | 1 | 112 | 2 | 8747.50404 | 1 | 4 | Mid-Value |
| 1127 | 17259 | 999.0 | 173 | 1 | 87 | 2 | 6587.70300 | 1 | 4 | Mid-Value |
| 1128 | 17341 | 999.0 | 150 | 1 | 161 | 2 | 13437.68995 | 1 | 4 | Mid-Value |
1129 rows × 10 columns
# Create a dataframe with CustomerID and Transaction_Date
df_day_order = df_first9m[['CustomerID', 'Transaction_Date']].copy()
# Convert Transaction_Date to day
df_day_order['TransactionDay'] = pd.to_datetime(df_first9m['Transaction_Date']).dt.date
# Chronological order within each customer so shift() walks purchase history
df_day_order = df_day_order.sort_values(['CustomerID', 'Transaction_Date'])
# Drop duplicates: keep one row per customer per calendar day (first occurrence)
df_day_order = df_day_order.drop_duplicates(subset=['CustomerID', 'TransactionDay'], keep='first')
# shift last 3 purchase day: previous, 2nd- and 3rd-previous purchase days per
# customer (NaN where the customer has fewer prior purchase days)
df_day_order['PrevPurchaseDate'] = df_day_order.groupby('CustomerID')['TransactionDay'].shift(1)
df_day_order['T2PurchaseDate'] = df_day_order.groupby('CustomerID')['TransactionDay'].shift(2)
df_day_order['T3PurchaseDate'] = df_day_order.groupby('CustomerID')['TransactionDay'].shift(3)
df_day_order
| CustomerID | Transaction_Date | TransactionDay | PrevPurchaseDate | T2PurchaseDate | T3PurchaseDate | |
|---|---|---|---|---|---|---|
| 22739 | 12347 | 2019-03-24 | 2019-03-24 | NaN | NaN | NaN |
| 12512 | 12348 | 2019-06-22 | 2019-06-22 | NaN | NaN | NaN |
| 26948 | 12359 | 2019-08-23 | 2019-08-23 | NaN | NaN | NaN |
| 20783 | 12370 | 2019-05-26 | 2019-05-26 | NaN | NaN | NaN |
| 20866 | 12370 | 2019-06-25 | 2019-06-25 | 2019-05-26 | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... |
| 20061 | 18260 | 2019-06-22 | 2019-06-22 | NaN | NaN | NaN |
| 20072 | 18260 | 2019-08-11 | 2019-08-11 | 2019-06-22 | NaN | NaN |
| 8295 | 18269 | 2019-04-05 | 2019-04-05 | NaN | NaN | NaN |
| 8302 | 18269 | 2019-06-20 | 2019-06-20 | 2019-04-05 | NaN | NaN |
| 27039 | 18283 | 2019-07-29 | 2019-07-29 | NaN | NaN | NaN |
2197 rows × 6 columns
# Number of days between each of the last 3 purchases.
# TransactionDay and the shifted columns hold datetime.date objects, so the
# raw subtraction produces an object-dtype Series without a .dt accessor on
# current pandas; convert both sides with to_datetime before taking .dt.days.
for lag_col, diff_col in [('PrevPurchaseDate', 'DayDiff1'),
                          ('T2PurchaseDate', 'DayDiff2'),
                          ('T3PurchaseDate', 'DayDiff3')]:
    df_day_order[diff_col] = (
        pd.to_datetime(df_day_order['TransactionDay'])
        - pd.to_datetime(df_day_order[lag_col])
    ).dt.days
df_day_order.head()
| CustomerID | Transaction_Date | TransactionDay | PrevPurchaseDate | T2PurchaseDate | T3PurchaseDate | DayDiff1 | DayDiff2 | DayDiff3 | |
|---|---|---|---|---|---|---|---|---|---|
| 22739 | 12347 | 2019-03-24 | 2019-03-24 | NaN | NaN | NaN | NaN | NaN | NaN |
| 12512 | 12348 | 2019-06-22 | 2019-06-22 | NaN | NaN | NaN | NaN | NaN | NaN |
| 26948 | 12359 | 2019-08-23 | 2019-08-23 | NaN | NaN | NaN | NaN | NaN | NaN |
| 20783 | 12370 | 2019-05-26 | 2019-05-26 | NaN | NaN | NaN | NaN | NaN | NaN |
| 20866 | 12370 | 2019-06-25 | 2019-06-25 | 2019-05-26 | NaN | NaN | 30.0 | NaN | NaN |
# Per-customer mean and standard deviation of the gap between consecutive
# purchase days (DayDiff1).
cadence_stats = df_day_order.groupby('CustomerID')['DayDiff1'].agg(['mean', 'std']).reset_index()
cadence_stats.columns = ['CustomerID', 'DayDiffMean', 'DayDiffStd']
df_day_diff = cadence_stats
df_day_diff.head()
| CustomerID | DayDiffMean | DayDiffStd | |
|---|---|---|---|
| 0 | 12347 | NaN | NaN |
| 1 | 12348 | NaN | NaN |
| 2 | 12359 | NaN | NaN |
| 3 | 12370 | 30.0 | NaN |
| 4 | 12377 | NaN | NaN |
# Most recent purchase row per customer, keeping only customers with at least
# four distinct purchase days (any row with a missing lag or diff is dropped).
df_day_order_last = (
    df_day_order
    .drop_duplicates(subset=['CustomerID'], keep='last')
    .dropna()
)
df_day_order_last
| CustomerID | Transaction_Date | TransactionDay | PrevPurchaseDate | T2PurchaseDate | T3PurchaseDate | DayDiff1 | DayDiff2 | DayDiff3 | |
|---|---|---|---|---|---|---|---|---|---|
| 2279 | 12471 | 2019-08-28 | 2019-08-28 | 2019-08-15 | 2019-07-28 | 2019-07-20 | 13.0 | 31.0 | 39.0 |
| 3446 | 12481 | 2019-08-02 | 2019-08-02 | 2019-07-10 | 2019-04-12 | 2019-04-11 | 23.0 | 112.0 | 113.0 |
| 10779 | 12494 | 2019-07-20 | 2019-07-20 | 2019-06-14 | 2019-06-13 | 2019-04-12 | 36.0 | 37.0 | 99.0 |
| 30443 | 12647 | 2019-06-26 | 2019-06-26 | 2019-06-09 | 2019-03-21 | 2019-02-25 | 17.0 | 97.0 | 121.0 |
| 12078 | 12681 | 2019-08-15 | 2019-08-15 | 2019-07-22 | 2019-07-09 | 2019-03-20 | 24.0 | 37.0 | 148.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 17201 | 18041 | 2019-07-31 | 2019-07-31 | 2019-07-30 | 2019-04-16 | 2019-01-19 | 1.0 | 106.0 | 193.0 |
| 15416 | 18055 | 2019-06-15 | 2019-06-15 | 2019-06-14 | 2019-03-19 | 2019-02-11 | 1.0 | 88.0 | 124.0 |
| 18749 | 18092 | 2019-08-29 | 2019-08-29 | 2019-07-14 | 2019-07-06 | 2019-04-11 | 46.0 | 54.0 | 140.0 |
| 14042 | 18109 | 2019-08-23 | 2019-08-23 | 2019-07-21 | 2019-05-02 | 2019-02-12 | 33.0 | 113.0 | 192.0 |
| 10196 | 18116 | 2019-07-16 | 2019-07-16 | 2019-07-15 | 2019-06-18 | 2019-06-17 | 1.0 | 28.0 | 29.0 |
118 rows × 9 columns
# Attach the per-customer cadence stats, then fold the cadence features into
# the user-level frame.
df_day_order_last = pd.merge(df_day_order_last, df_day_diff, on='CustomerID')
# Explicit join key: without `on=`, pandas silently merges on every shared
# column, which breaks if another common column is ever added.
df_user = pd.merge(
    df_user,
    df_day_order_last[['CustomerID', 'DayDiff1', 'DayDiff2', 'DayDiff3',
                       'DayDiffMean', 'DayDiffStd']],
    on='CustomerID',
)
df_user.describe()
| CustomerID | NextPurchaseDate | Recency | RecencyScore | Frequency | FrequencyScore | Monetary | MonetaryScore | OverallScore | DayDiff1 | DayDiff2 | DayDiff3 | DayDiffMean | DayDiffStd | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 118.000000 | 118.000000 | 118.000000 | 118.000000 | 118.000000 | 118.000000 | 118.000000 | 118.000000 | 118.000000 | 118.000000 | 118.000000 | 118.000000 | 118.000000 | 118.000000 |
| mean | 15265.635593 | 373.423729 | 32.949153 | 2.771186 | 90.559322 | 1.254237 | 9159.489043 | 0.711864 | 4.737288 | 26.610169 | 60.508475 | 103.703390 | 37.572460 | 35.809408 |
| std | 1675.704606 | 434.726644 | 36.669661 | 0.529651 | 93.810642 | 0.859203 | 11761.162986 | 0.705312 | 1.565810 | 30.447667 | 41.136576 | 56.838248 | 16.708565 | 19.919037 |
| min | 12471.000000 | 1.000000 | 0.000000 | 0.000000 | 6.000000 | 0.000000 | 395.882300 | 0.000000 | 1.000000 | 1.000000 | 2.000000 | 3.000000 | 1.000000 | 0.000000 |
| 25% | 13782.250000 | 56.000000 | 8.000000 | 3.000000 | 35.250000 | 1.000000 | 2837.539815 | 0.000000 | 3.000000 | 2.000000 | 29.250000 | 57.000000 | 26.437500 | 20.752457 |
| 50% | 15103.500000 | 103.500000 | 18.000000 | 3.000000 | 61.500000 | 1.000000 | 6131.462435 | 1.000000 | 5.000000 | 16.000000 | 50.500000 | 93.500000 | 37.000000 | 33.549336 |
| 75% | 16764.000000 | 999.000000 | 44.750000 | 3.000000 | 115.000000 | 2.000000 | 10861.316430 | 1.000000 | 6.000000 | 38.750000 | 88.750000 | 148.750000 | 49.000000 | 45.539934 |
| max | 18116.000000 | 999.000000 | 217.000000 | 3.000000 | 684.000000 | 3.000000 | 86143.004980 | 3.000000 | 9.000000 | 146.000000 | 198.000000 | 227.000000 | 75.666667 | 92.376043 |
# One-hot encode the categorical columns (the Segment label becomes three
# Segment_* dummies) on a copy of the user frame.
df_npd = pd.get_dummies(df_user.copy())
# Bucket the target: days until the next purchase -> four ordinal classes.
# np.select takes the first matching condition; anything <= 30 (including the
# 999-day "no repurchase" placeholder's complement) falls to the default.
range_conditions = [
    df_npd['NextPurchaseDate'] > 90,
    df_npd['NextPurchaseDate'] > 60,
    df_npd['NextPurchaseDate'] > 30,
]
range_labels = ['90+ days', '60-90 days', '30-60 days']
df_npd['NextPurchaseDayRange'] = np.select(range_conditions, range_labels, default='0-30 days')
df_npd.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 118 entries, 0 to 117 Data columns (total 18 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CustomerID 118 non-null int64 1 NextPurchaseDate 118 non-null float64 2 Recency 118 non-null int64 3 RecencyScore 118 non-null int64 4 Frequency 118 non-null int64 5 FrequencyScore 118 non-null int64 6 Monetary 118 non-null float64 7 MonetaryScore 118 non-null int64 8 OverallScore 118 non-null int64 9 DayDiff1 118 non-null float64 10 DayDiff2 118 non-null float64 11 DayDiff3 118 non-null float64 12 DayDiffMean 118 non-null float64 13 DayDiffStd 118 non-null float64 14 Segment_High-Value 118 non-null uint8 15 Segment_Low-Value 118 non-null uint8 16 Segment_Mid-Value 118 non-null uint8 17 NextPurchaseDayRange 118 non-null object dtypes: float64(7), int64(7), object(1), uint8(3) memory usage: 15.1+ KB
# Keep only the model inputs and the target; the raw RFM values and their
# individual scores are dropped to avoid multicollinearity with OverallScore
# and the segment dummies.
columns_to_keep = [
    'CustomerID', 'NextPurchaseDayRange',
    'Segment_High-Value', 'Segment_Low-Value', 'Segment_Mid-Value',
    'DayDiffMean', 'DayDiffStd', 'OverallScore',
]
df_npd = df_npd.loc[:, columns_to_keep]
df_npd
| CustomerID | NextPurchaseDayRange | Segment_High-Value | Segment_Low-Value | Segment_Mid-Value | DayDiffMean | DayDiffStd | OverallScore | |
|---|---|---|---|---|---|---|---|---|
| 0 | 17850 | 90+ days | 0 | 0 | 1 | 2.777778 | 3.527668 | 5 |
| 1 | 14606 | 0-30 days | 1 | 0 | 0 | 14.666667 | 17.128367 | 8 |
| 2 | 14911 | 30-60 days | 1 | 0 | 0 | 12.529412 | 13.505173 | 8 |
| 3 | 17841 | 0-30 days | 1 | 0 | 0 | 18.666667 | 15.310919 | 8 |
| 4 | 13081 | 90+ days | 1 | 0 | 0 | 40.600000 | 70.009285 | 8 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 113 | 16782 | 90+ days | 0 | 0 | 1 | 26.333333 | 15.044379 | 5 |
| 114 | 14796 | 90+ days | 0 | 0 | 1 | 15.666667 | 25.403412 | 5 |
| 115 | 15570 | 60-90 days | 0 | 0 | 1 | 25.800000 | 34.426734 | 5 |
| 116 | 15356 | 90+ days | 0 | 0 | 1 | 23.666667 | 39.259818 | 5 |
| 117 | 17827 | 90+ days | 0 | 0 | 1 | 1.000000 | 0.000000 | 4 |
118 rows × 8 columns
# Separate the features from the target
feature_cols = ['Segment_High-Value', 'Segment_Low-Value', 'Segment_Mid-Value',
                'DayDiffMean', 'DayDiffStd', 'OverallScore']
X, y = df_npd[feature_cols], df_npd['NextPurchaseDayRange']
# Percentage of observations in each target class
class_counts = 100 * df_npd['NextPurchaseDayRange'].value_counts(normalize=True)
# Visualize the class balance of the target variable as a bar chart
sb.set_style("whitegrid")
plt.rcParams['font.family'] = 'serif'

fig, ax = plt.subplots(figsize=(10, 6))
sb.barplot(x=class_counts.index, y=class_counts.values, color="#0059b3", ax=ax)
ax.set_title('Proportion of Each Class in Next Purchase Day Range', fontsize=16)
ax.set_xlabel('Next Purchase Day Range', fontsize=14)
ax.set_ylabel('Percentage', fontsize=14)
# Annotate each bar with its percentage
for bar_idx, pct in enumerate(class_counts.values):
    ax.text(bar_idx, pct + 0.5, f'{pct:.2f}%', ha='center', va='bottom',
            fontsize=12, color='black')
# Angle the x labels so the range names don't collide
ax.tick_params(axis='x', rotation=45)
fig.tight_layout()
plt.show()
# SMOTE is used here but never imported at the top of the file; import it so
# this cell runs standalone. (imbalanced-learn is already a de-facto runtime
# dependency of this notebook.)
from imblearn.over_sampling import SMOTE

# Split FIRST, then oversample only the training fold. Applying SMOTE to the
# full data before splitting leaks synthetic neighbours of test rows into the
# training set and inflates every accuracy reported below.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)
smote = SMOTE(random_state=42)  # seeded for reproducibility
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
X_train, y_train = X_resampled, y_resampled
# Kept for backward compatibility with cells that inspect the resampled frame
df_resampled = pd.DataFrame(X_resampled, columns=X.columns)
# Multinomial logistic regression baseline (lbfgs solver)
model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
# Fit on the training fold
model.fit(X_train, y_train)
# Class-probability scores for the test fold
y_score = model.predict_proba(X_test)
# Hard class predictions for the test fold
y_pred = model.predict(X_test)
# Report test accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
Accuracy: 0.37
# Random-forest baseline.
# These names are never imported at the top of the file; import them here so
# the cell runs standalone.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report

rf = RandomForestClassifier(n_estimators=100, random_state=42)
# Fit the classifier to the training data
rf.fit(X_train, y_train)
# Predict on the test data
y_pred = rf.predict(X_test)
# Evaluate: accuracy, confusion matrix, per-class precision/recall/F1
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classif_report = classification_report(y_test, y_pred)
# Print the results
print(f"Random Forest Model Accuracy: {accuracy:.2f}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(classif_report)
Random Forest Model Accuracy: 0.58
Confusion Matrix:
[[29 1 2 3]
[ 2 21 6 3]
[ 8 5 17 2]
[10 10 4 11]]
Classification Report:
precision recall f1-score support
0-30 days 0.59 0.83 0.69 35
30-60 days 0.57 0.66 0.61 32
60-90 days 0.59 0.53 0.56 32
90+ days 0.58 0.31 0.41 35
accuracy 0.58 134
macro avg 0.58 0.58 0.57 134
weighted avg 0.58 0.58 0.57 134
# XGBoost baseline. XGBClassifier needs integer class labels, so encode the
# string day-range labels first and invert after predicting.
# confusion_matrix / classification_report are never imported at the top of
# the file; import them here so the cell runs standalone.
from sklearn.metrics import confusion_matrix, classification_report

label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)
# Fit the XGBoost model using the encoded labels.
# `use_label_encoder` was deprecated and later removed from xgboost; the
# explicit LabelEncoder above replaces it.
xgb_classifier = XGBClassifier(eval_metric='mlogloss')
xgb_classifier.fit(X_train, y_train_encoded)
# Predict on the test data and map predictions back to the original labels
y_pred_encoded = xgb_classifier.predict(X_test)
y_pred = label_encoder.inverse_transform(y_pred_encoded)
# Evaluate: accuracy, confusion matrix, per-class precision/recall/F1
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classif_report = classification_report(y_test, y_pred)
# Print the results
print(f"XGBoost Model Accuracy: {accuracy:.2f}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(classif_report)
XGBoost Model Accuracy: 0.57
Confusion Matrix:
[[31 1 1 2]
[ 5 18 5 4]
[ 9 3 18 2]
[ 8 12 5 10]]
Classification Report:
precision recall f1-score support
0-30 days 0.58 0.89 0.70 35
30-60 days 0.53 0.56 0.55 32
60-90 days 0.62 0.56 0.59 32
90+ days 0.56 0.29 0.38 35
accuracy 0.57 134
macro avg 0.57 0.57 0.55 134
weighted avg 0.57 0.57 0.55 134